DataCamp offers several interactive courses related to R Programming. While much of it is review, it is always helpful to see other perspectives on material. As well, DataCamp has some interesting materials on packages that I want to learn better (ggplot2, dplyr, ggvis, etc.). This document summarizes a few key insights from:
There are a few nuggets from within these beginning modules, including:
Below is some sample code showing examples for the generic statements:
# Factors
# An unordered factor has levels but no ranking between them, so relational
# operators such as ">" give a warning and return NA.
xRaw <- c("High", "High", "Low", "Low", "Medium", "Very High", "Low")
xFactorNon <- factor(xRaw, levels=c("Low", "Medium", "High", "Very High"))
xFactorNon
## [1] High High Low Low Medium Very High Low
## Levels: Low Medium High Very High
xFactorNon[xFactorNon == "High"] > xFactorNon[xFactorNon == "Low"][1]
## Warning in Ops.factor(xFactorNon[xFactorNon == "High"],
## xFactorNon[xFactorNon == : '>' not meaningful for factors
## [1] NA NA
# With ordered=TRUE the levels are ranked in the order supplied to levels=,
# so the same comparison is now meaningful.
xFactorOrder <- factor(xRaw, ordered=TRUE, levels=c("Low", "Medium", "High", "Very High"))
xFactorOrder
## [1] High High Low Low Medium Very High Low
## Levels: Low < Medium < High < Very High
xFactorOrder[xFactorOrder == "High"] > xFactorOrder[xFactorOrder == "Low"][1]
## [1] TRUE TRUE
# Subsets
data(mtcars)
subset(mtcars, mpg>=25)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
## Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
## Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
## Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
## Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
## Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
identical(subset(mtcars, mpg>=25), mtcars[mtcars$mpg>=25, ])
## [1] TRUE
subset(mtcars, mpg>25, select=c("mpg", "cyl", "disp"))
## mpg cyl disp
## Fiat 128 32.4 4 78.7
## Honda Civic 30.4 4 75.7
## Toyota Corolla 33.9 4 71.1
## Fiat X1-9 27.3 4 79.0
## Porsche 914-2 26.0 4 120.3
## Lotus Europa 30.4 4 95.1
# & and && (same as | and ||)
# & and | compare element by element and return a vector of the same length.
compA <- c(2, 3, 4, 1, 2, 3)
compB <- c(1, 2, 3, 4, 5, 6)
(compA > compB) & (compA + compB < 6)
## [1] TRUE TRUE FALSE FALSE FALSE FALSE
(compA > compB) | (compA + compB < 6)
## [1] TRUE TRUE TRUE TRUE FALSE FALSE
# && and || work on a single TRUE/FALSE. Older versions of R silently used only
# the first element of longer vectors; since R 4.3.0 that is an error, so the
# first element is selected explicitly here.
(compA > compB)[1] && (compA + compB < 6)[1]
## [1] TRUE
(compA > compB)[1] || (compA + compB < 6)[1]
## [1] TRUE
# Loops and cat()
# for (a in b) {
# do stuff
# if (exitCond) { break }
# if (nextCond) { next }
# do some more stuff
# }
# Walk through the element-wise products of compA and compB, showing how
# next (skip to the next iteration) and break (leave the loop) behave.
for (myVal in compA*compB) {
print(paste0("myVal is: ", myVal))
# Multiples of 3: report and skip the rest of this iteration
if ((myVal %% 3) == 0) { cat("Divisible by 3, not happy about that\n\n"); next }
print("That is not divisible by 3")
# Only reached for non-multiples of 3; a multiple of 5 here ends the loop entirely
if ((myVal %% 5) == 0) { cat("Exiting due to divisible by 5 but not divisible by 3\n\n"); break }
cat("Onwards and upwards\n\n")
}
## [1] "myVal is: 2"
## [1] "That is not divisible by 3"
## Onwards and upwards
##
## [1] "myVal is: 6"
## Divisible by 3, not happy about that
##
## [1] "myVal is: 12"
## Divisible by 3, not happy about that
##
## [1] "myVal is: 4"
## [1] "That is not divisible by 3"
## Onwards and upwards
##
## [1] "myVal is: 10"
## [1] "That is not divisible by 3"
## Exiting due to divisible by 5 but not divisible by 3
# args() and search()
args(plot.default)
## function (x, y = NULL, type = "p", xlim = NULL, ylim = NULL,
## log = "", main = NULL, sub = NULL, xlab = NULL, ylab = NULL,
## ann = par("ann"), axes = TRUE, frame.plot = axes, panel.first = NULL,
## panel.last = NULL, asp = NA, ...)
## NULL
search()
## [1] ".GlobalEnv" "package:stats" "package:graphics"
## [4] "package:grDevices" "package:utils" "package:datasets"
## [7] "package:methods" "Autoloads" "package:base"
# unique()
compA
## [1] 2 3 4 1 2 3
unique(compA)
## [1] 2 3 4 1
# unlist()
listA <- as.list(compA)
unlist(listA)
## [1] 2 3 4 1 2 3
identical(compA, unlist(listA))
## [1] TRUE
# sort()
sort(mtcars$mpg)
## [1] 10.4 10.4 13.3 14.3 14.7 15.0 15.2 15.2 15.5 15.8 16.4 17.3 17.8 18.1
## [15] 18.7 19.2 19.2 19.7 21.0 21.0 21.4 21.4 21.5 22.8 22.8 24.4 26.0 27.3
## [29] 30.4 30.4 32.4 33.9
sort(mtcars$mpg, decreasing=TRUE)
## [1] 33.9 32.4 30.4 30.4 27.3 26.0 24.4 22.8 22.8 21.5 21.4 21.4 21.0 21.0
## [15] 19.7 19.2 19.2 18.7 18.1 17.8 17.3 16.4 15.8 15.5 15.2 15.2 15.0 14.7
## [29] 14.3 13.3 10.4 10.4
# rep()
rep(1:6, times=2) # 1:6 followed by 1:6
## [1] 1 2 3 4 5 6 1 2 3 4 5 6
rep(1:6, each=2) # 1 1 2 2 3 3 4 4 5 5 6 6
## [1] 1 1 2 2 3 3 4 4 5 5 6 6
rep(1:6, times=2, each=3) # 1 1 1 2 2 2 3 3 3 4 4 4 5 5 5 6 6 6 repeated twice (each comes first)
## [1] 1 1 1 2 2 2 3 3 3 4 4 4 5 5 5 6 6 6 1 1 1 2 2 2 3 3 3 4 4 4 5 5 5 6 6
## [36] 6
rep(1:6, times=6:1) # 1 1 1 1 1 1 2 2 2 2 2 3 3 3 3 4 4 4 5 5 6
## [1] 1 1 1 1 1 1 2 2 2 2 2 3 3 3 3 4 4 4 5 5 6
# append()
myWords <- c("The", "cat", "in", "the", "hat")
paste(append(myWords, c("is", "fun", "to", "read")), collapse=" ")
## [1] "The cat in the hat is fun to read"
paste(append(myWords, "funny", 4), collapse=" ")
## [1] "The cat in the funny hat"
# gsub() with backreferences ("\\1", "\\2", "\\3")
sampMsg <- "This is from myname@subdomain.mydomain.com again"
gsub("(^.*\\w*[a-zA-Z0-9]+@)([a-zA-Z0-9]+\\.[a-zA-Z0-9.]+)(.*$)", "\\1", sampMsg)
## [1] "This is from myname@"
gsub("(^.*\\w*[a-zA-Z0-9]+@)([a-zA-Z0-9]+\\.[a-zA-Z0-9.]+)(.*$)", "\\2", sampMsg)
## [1] "subdomain.mydomain.com"
gsub("(^.*\\w*[a-zA-Z0-9]+@)([a-zA-Z0-9]+\\.[a-zA-Z0-9.]+)(.*$)", "\\3", sampMsg)
## [1] " again"
# rev()
compA
## [1] 2 3 4 1 2 3
rev(compA)
## [1] 3 2 1 4 3 2
Below is some sample code showing examples for the apply statements:
# lapply
args(lapply)
## function (X, FUN, ...)
## NULL
lapply(1:5, FUN=sqrt)
## [[1]]
## [1] 1
##
## [[2]]
## [1] 1.414214
##
## [[3]]
## [1] 1.732051
##
## [[4]]
## [1] 2
##
## [[5]]
## [1] 2.236068
lapply(1:5, FUN=function(x, y=2) { c(x=x, y=y, pow=x^y) }, y=3)
## [[1]]
## x y pow
## 1 3 1
##
## [[2]]
## x y pow
## 2 3 8
##
## [[3]]
## x y pow
## 3 3 27
##
## [[4]]
## x y pow
## 4 3 64
##
## [[5]]
## x y pow
## 5 3 125
lapply(1:5, FUN=function(x, y=2) { if (x <= 3) {c(x=x, y=y, pow=x^y) } else { c(pow=x^y) } }, y=3)
## [[1]]
## x y pow
## 1 3 1
##
## [[2]]
## x y pow
## 2 3 8
##
## [[3]]
## x y pow
## 3 3 27
##
## [[4]]
## pow
## 64
##
## [[5]]
## pow
## 125
# sapply (defaults to returning a named vector/array if possible; is lapply otherwise)
args(sapply)
## function (X, FUN, ..., simplify = TRUE, USE.NAMES = TRUE)
## NULL
args(simplify2array)
## function (x, higher = TRUE)
## NULL
sapply(1:5, FUN=sqrt)
## [1] 1.000000 1.414214 1.732051 2.000000 2.236068
sapply(1:5, FUN=function(x, y=2) { c(x=x, y=y, pow=x^y) }, y=3)
## [,1] [,2] [,3] [,4] [,5]
## x 1 2 3 4 5
## y 3 3 3 3 3
## pow 1 8 27 64 125
sapply(1:5, FUN=function(x, y=2) { if (x <= 3) {c(x=x, y=y, pow=x^y) } else { c(pow=x^y) } }, y=3)
## [[1]]
## x y pow
## 1 3 1
##
## [[2]]
## x y pow
## 2 3 8
##
## [[3]]
## x y pow
## 3 3 27
##
## [[4]]
## pow
## 64
##
## [[5]]
## pow
## 125
# vapply (tells sapply exactly what should be returned; errors out otherwise)
args(vapply)
## function (X, FUN, FUN.VALUE, ..., USE.NAMES = TRUE)
## NULL
vapply(1:5, FUN=sqrt, FUN.VALUE=numeric(1))
## [1] 1.000000 1.414214 1.732051 2.000000 2.236068
vapply(1:5, FUN=function(x, y=2) { c(x=x, y=y, pow=x^y) }, FUN.VALUE=numeric(3), y=3)
## [,1] [,2] [,3] [,4] [,5]
## x 1 2 3 4 5
## y 3 3 3 3 3
## pow 1 8 27 64 125
Below is some sample code for handling dates and times in R:
Sys.Date()
## [1] "2016-08-04"
Sys.time()
## [1] "2016-08-04 08:12:23 CDT"
args(strptime)
## function (x, format, tz = "")
## NULL
# Capture the current time once so the examples below all use the same instant.
rightNow <- as.POSIXct(Sys.time())
# Literal text (such as "**" or " hours and ") is copied through unchanged.
# Note the case: %m is the month, %M is the minute.
format(rightNow, "%Y**%m-%d %H hours and %M minutes", usetz=TRUE)
## [1] "2016**08-04 08 hours and 12 minutes CDT"
lastChristmasNoon <- as.POSIXct("2015-12-25 12:00:00", format="%Y-%m-%d %X")
rightNow - lastChristmasNoon
## Time difference of 222.8003 days
nextUMHomeGame <- as.POSIXct("16/SEP/3 12:00:00", format="%y/%b/%d %H:%M:%S", tz="America/Detroit")
nextUMHomeGame - rightNow
## Time difference of 30.11639 days
# Time zones available in R
OlsonNames()
## [1] "Africa/Abidjan" "Africa/Accra"
## [3] "Africa/Addis_Ababa" "Africa/Algiers"
## [5] "Africa/Asmara" "Africa/Asmera"
## [7] "Africa/Bamako" "Africa/Bangui"
## [9] "Africa/Banjul" "Africa/Bissau"
## [11] "Africa/Blantyre" "Africa/Brazzaville"
## [13] "Africa/Bujumbura" "Africa/Cairo"
## [15] "Africa/Casablanca" "Africa/Ceuta"
## [17] "Africa/Conakry" "Africa/Dakar"
## [19] "Africa/Dar_es_Salaam" "Africa/Djibouti"
## [21] "Africa/Douala" "Africa/El_Aaiun"
## [23] "Africa/Freetown" "Africa/Gaborone"
## [25] "Africa/Harare" "Africa/Johannesburg"
## [27] "Africa/Juba" "Africa/Kampala"
## [29] "Africa/Khartoum" "Africa/Kigali"
## [31] "Africa/Kinshasa" "Africa/Lagos"
## [33] "Africa/Libreville" "Africa/Lome"
## [35] "Africa/Luanda" "Africa/Lubumbashi"
## [37] "Africa/Lusaka" "Africa/Malabo"
## [39] "Africa/Maputo" "Africa/Maseru"
## [41] "Africa/Mbabane" "Africa/Mogadishu"
## [43] "Africa/Monrovia" "Africa/Nairobi"
## [45] "Africa/Ndjamena" "Africa/Niamey"
## [47] "Africa/Nouakchott" "Africa/Ouagadougou"
## [49] "Africa/Porto-Novo" "Africa/Sao_Tome"
## [51] "Africa/Timbuktu" "Africa/Tripoli"
## [53] "Africa/Tunis" "Africa/Windhoek"
## [55] "America/Adak" "America/Anchorage"
## [57] "America/Anguilla" "America/Antigua"
## [59] "America/Araguaina" "America/Argentina/Buenos_Aires"
## [61] "America/Argentina/Catamarca" "America/Argentina/ComodRivadavia"
## [63] "America/Argentina/Cordoba" "America/Argentina/Jujuy"
## [65] "America/Argentina/La_Rioja" "America/Argentina/Mendoza"
## [67] "America/Argentina/Rio_Gallegos" "America/Argentina/Salta"
## [69] "America/Argentina/San_Juan" "America/Argentina/San_Luis"
## [71] "America/Argentina/Tucuman" "America/Argentina/Ushuaia"
## [73] "America/Aruba" "America/Asuncion"
## [75] "America/Atikokan" "America/Atka"
## [77] "America/Bahia" "America/Bahia_Banderas"
## [79] "America/Barbados" "America/Belem"
## [81] "America/Belize" "America/Blanc-Sablon"
## [83] "America/Boa_Vista" "America/Bogota"
## [85] "America/Boise" "America/Buenos_Aires"
## [87] "America/Cambridge_Bay" "America/Campo_Grande"
## [89] "America/Cancun" "America/Caracas"
## [91] "America/Catamarca" "America/Cayenne"
## [93] "America/Cayman" "America/Chicago"
## [95] "America/Chihuahua" "America/Coral_Harbour"
## [97] "America/Cordoba" "America/Costa_Rica"
## [99] "America/Creston" "America/Cuiaba"
## [101] "America/Curacao" "America/Danmarkshavn"
## [103] "America/Dawson" "America/Dawson_Creek"
## [105] "America/Denver" "America/Detroit"
## [107] "America/Dominica" "America/Edmonton"
## [109] "America/Eirunepe" "America/El_Salvador"
## [111] "America/Ensenada" "America/Fort_Nelson"
## [113] "America/Fort_Wayne" "America/Fortaleza"
## [115] "America/Glace_Bay" "America/Godthab"
## [117] "America/Goose_Bay" "America/Grand_Turk"
## [119] "America/Grenada" "America/Guadeloupe"
## [121] "America/Guatemala" "America/Guayaquil"
## [123] "America/Guyana" "America/Halifax"
## [125] "America/Havana" "America/Hermosillo"
## [127] "America/Indiana/Indianapolis" "America/Indiana/Knox"
## [129] "America/Indiana/Marengo" "America/Indiana/Petersburg"
## [131] "America/Indiana/Tell_City" "America/Indiana/Vevay"
## [133] "America/Indiana/Vincennes" "America/Indiana/Winamac"
## [135] "America/Indianapolis" "America/Inuvik"
## [137] "America/Iqaluit" "America/Jamaica"
## [139] "America/Jujuy" "America/Juneau"
## [141] "America/Kentucky/Louisville" "America/Kentucky/Monticello"
## [143] "America/Knox_IN" "America/Kralendijk"
## [145] "America/La_Paz" "America/Lima"
## [147] "America/Los_Angeles" "America/Louisville"
## [149] "America/Lower_Princes" "America/Maceio"
## [151] "America/Managua" "America/Manaus"
## [153] "America/Marigot" "America/Martinique"
## [155] "America/Matamoros" "America/Mazatlan"
## [157] "America/Mendoza" "America/Menominee"
## [159] "America/Merida" "America/Metlakatla"
## [161] "America/Mexico_City" "America/Miquelon"
## [163] "America/Moncton" "America/Monterrey"
## [165] "America/Montevideo" "America/Montreal"
## [167] "America/Montserrat" "America/Nassau"
## [169] "America/New_York" "America/Nipigon"
## [171] "America/Nome" "America/Noronha"
## [173] "America/North_Dakota/Beulah" "America/North_Dakota/Center"
## [175] "America/North_Dakota/New_Salem" "America/Ojinaga"
## [177] "America/Panama" "America/Pangnirtung"
## [179] "America/Paramaribo" "America/Phoenix"
## [181] "America/Port-au-Prince" "America/Port_of_Spain"
## [183] "America/Porto_Acre" "America/Porto_Velho"
## [185] "America/Puerto_Rico" "America/Rainy_River"
## [187] "America/Rankin_Inlet" "America/Recife"
## [189] "America/Regina" "America/Resolute"
## [191] "America/Rio_Branco" "America/Rosario"
## [193] "America/Santa_Isabel" "America/Santarem"
## [195] "America/Santiago" "America/Santo_Domingo"
## [197] "America/Sao_Paulo" "America/Scoresbysund"
## [199] "America/Shiprock" "America/Sitka"
## [201] "America/St_Barthelemy" "America/St_Johns"
## [203] "America/St_Kitts" "America/St_Lucia"
## [205] "America/St_Thomas" "America/St_Vincent"
## [207] "America/Swift_Current" "America/Tegucigalpa"
## [209] "America/Thule" "America/Thunder_Bay"
## [211] "America/Tijuana" "America/Toronto"
## [213] "America/Tortola" "America/Vancouver"
## [215] "America/Virgin" "America/Whitehorse"
## [217] "America/Winnipeg" "America/Yakutat"
## [219] "America/Yellowknife" "Antarctica/Casey"
## [221] "Antarctica/Davis" "Antarctica/DumontDUrville"
## [223] "Antarctica/Macquarie" "Antarctica/Mawson"
## [225] "Antarctica/McMurdo" "Antarctica/Palmer"
## [227] "Antarctica/Rothera" "Antarctica/South_Pole"
## [229] "Antarctica/Syowa" "Antarctica/Troll"
## [231] "Antarctica/Vostok" "Arctic/Longyearbyen"
## [233] "Asia/Aden" "Asia/Almaty"
## [235] "Asia/Amman" "Asia/Anadyr"
## [237] "Asia/Aqtau" "Asia/Aqtobe"
## [239] "Asia/Ashgabat" "Asia/Ashkhabad"
## [241] "Asia/Baghdad" "Asia/Bahrain"
## [243] "Asia/Baku" "Asia/Bangkok"
## [245] "Asia/Beirut" "Asia/Bishkek"
## [247] "Asia/Brunei" "Asia/Calcutta"
## [249] "Asia/Chita" "Asia/Choibalsan"
## [251] "Asia/Chongqing" "Asia/Chungking"
## [253] "Asia/Colombo" "Asia/Dacca"
## [255] "Asia/Damascus" "Asia/Dhaka"
## [257] "Asia/Dili" "Asia/Dubai"
## [259] "Asia/Dushanbe" "Asia/Gaza"
## [261] "Asia/Harbin" "Asia/Hebron"
## [263] "Asia/Ho_Chi_Minh" "Asia/Hong_Kong"
## [265] "Asia/Hovd" "Asia/Irkutsk"
## [267] "Asia/Istanbul" "Asia/Jakarta"
## [269] "Asia/Jayapura" "Asia/Jerusalem"
## [271] "Asia/Kabul" "Asia/Kamchatka"
## [273] "Asia/Karachi" "Asia/Kashgar"
## [275] "Asia/Kathmandu" "Asia/Katmandu"
## [277] "Asia/Khandyga" "Asia/Kolkata"
## [279] "Asia/Krasnoyarsk" "Asia/Kuala_Lumpur"
## [281] "Asia/Kuching" "Asia/Kuwait"
## [283] "Asia/Macao" "Asia/Macau"
## [285] "Asia/Magadan" "Asia/Makassar"
## [287] "Asia/Manila" "Asia/Muscat"
## [289] "Asia/Nicosia" "Asia/Novokuznetsk"
## [291] "Asia/Novosibirsk" "Asia/Omsk"
## [293] "Asia/Oral" "Asia/Phnom_Penh"
## [295] "Asia/Pontianak" "Asia/Pyongyang"
## [297] "Asia/Qatar" "Asia/Qyzylorda"
## [299] "Asia/Rangoon" "Asia/Riyadh"
## [301] "Asia/Saigon" "Asia/Sakhalin"
## [303] "Asia/Samarkand" "Asia/Seoul"
## [305] "Asia/Shanghai" "Asia/Singapore"
## [307] "Asia/Srednekolymsk" "Asia/Taipei"
## [309] "Asia/Tashkent" "Asia/Tbilisi"
## [311] "Asia/Tehran" "Asia/Tel_Aviv"
## [313] "Asia/Thimbu" "Asia/Thimphu"
## [315] "Asia/Tokyo" "Asia/Ujung_Pandang"
## [317] "Asia/Ulaanbaatar" "Asia/Ulan_Bator"
## [319] "Asia/Urumqi" "Asia/Ust-Nera"
## [321] "Asia/Vientiane" "Asia/Vladivostok"
## [323] "Asia/Yakutsk" "Asia/Yekaterinburg"
## [325] "Asia/Yerevan" "Atlantic/Azores"
## [327] "Atlantic/Bermuda" "Atlantic/Canary"
## [329] "Atlantic/Cape_Verde" "Atlantic/Faeroe"
## [331] "Atlantic/Faroe" "Atlantic/Jan_Mayen"
## [333] "Atlantic/Madeira" "Atlantic/Reykjavik"
## [335] "Atlantic/South_Georgia" "Atlantic/St_Helena"
## [337] "Atlantic/Stanley" "Australia/ACT"
## [339] "Australia/Adelaide" "Australia/Brisbane"
## [341] "Australia/Broken_Hill" "Australia/Canberra"
## [343] "Australia/Currie" "Australia/Darwin"
## [345] "Australia/Eucla" "Australia/Hobart"
## [347] "Australia/LHI" "Australia/Lindeman"
## [349] "Australia/Lord_Howe" "Australia/Melbourne"
## [351] "Australia/North" "Australia/NSW"
## [353] "Australia/Perth" "Australia/Queensland"
## [355] "Australia/South" "Australia/Sydney"
## [357] "Australia/Tasmania" "Australia/Victoria"
## [359] "Australia/West" "Australia/Yancowinna"
## [361] "Brazil/Acre" "Brazil/DeNoronha"
## [363] "Brazil/East" "Brazil/West"
## [365] "Canada/Atlantic" "Canada/Central"
## [367] "Canada/East-Saskatchewan" "Canada/Eastern"
## [369] "Canada/Mountain" "Canada/Newfoundland"
## [371] "Canada/Pacific" "Canada/Saskatchewan"
## [373] "Canada/Yukon" "CET"
## [375] "Chile/Continental" "Chile/EasterIsland"
## [377] "CST6CDT" "Cuba"
## [379] "EET" "Egypt"
## [381] "Eire" "EST"
## [383] "EST5EDT" "Etc/GMT"
## [385] "Etc/GMT-0" "Etc/GMT-1"
## [387] "Etc/GMT-10" "Etc/GMT-11"
## [389] "Etc/GMT-12" "Etc/GMT-13"
## [391] "Etc/GMT-14" "Etc/GMT-2"
## [393] "Etc/GMT-3" "Etc/GMT-4"
## [395] "Etc/GMT-5" "Etc/GMT-6"
## [397] "Etc/GMT-7" "Etc/GMT-8"
## [399] "Etc/GMT-9" "Etc/GMT+0"
## [401] "Etc/GMT+1" "Etc/GMT+10"
## [403] "Etc/GMT+11" "Etc/GMT+12"
## [405] "Etc/GMT+2" "Etc/GMT+3"
## [407] "Etc/GMT+4" "Etc/GMT+5"
## [409] "Etc/GMT+6" "Etc/GMT+7"
## [411] "Etc/GMT+8" "Etc/GMT+9"
## [413] "Etc/GMT0" "Etc/Greenwich"
## [415] "Etc/UCT" "Etc/Universal"
## [417] "Etc/UTC" "Etc/Zulu"
## [419] "Europe/Amsterdam" "Europe/Andorra"
## [421] "Europe/Athens" "Europe/Belfast"
## [423] "Europe/Belgrade" "Europe/Berlin"
## [425] "Europe/Bratislava" "Europe/Brussels"
## [427] "Europe/Bucharest" "Europe/Budapest"
## [429] "Europe/Busingen" "Europe/Chisinau"
## [431] "Europe/Copenhagen" "Europe/Dublin"
## [433] "Europe/Gibraltar" "Europe/Guernsey"
## [435] "Europe/Helsinki" "Europe/Isle_of_Man"
## [437] "Europe/Istanbul" "Europe/Jersey"
## [439] "Europe/Kaliningrad" "Europe/Kiev"
## [441] "Europe/Lisbon" "Europe/Ljubljana"
## [443] "Europe/London" "Europe/Luxembourg"
## [445] "Europe/Madrid" "Europe/Malta"
## [447] "Europe/Mariehamn" "Europe/Minsk"
## [449] "Europe/Monaco" "Europe/Moscow"
## [451] "Europe/Nicosia" "Europe/Oslo"
## [453] "Europe/Paris" "Europe/Podgorica"
## [455] "Europe/Prague" "Europe/Riga"
## [457] "Europe/Rome" "Europe/Samara"
## [459] "Europe/San_Marino" "Europe/Sarajevo"
## [461] "Europe/Simferopol" "Europe/Skopje"
## [463] "Europe/Sofia" "Europe/Stockholm"
## [465] "Europe/Tallinn" "Europe/Tirane"
## [467] "Europe/Tiraspol" "Europe/Uzhgorod"
## [469] "Europe/Vaduz" "Europe/Vatican"
## [471] "Europe/Vienna" "Europe/Vilnius"
## [473] "Europe/Volgograd" "Europe/Warsaw"
## [475] "Europe/Zagreb" "Europe/Zaporozhye"
## [477] "Europe/Zurich" "GB"
## [479] "GB-Eire" "GMT"
## [481] "GMT-0" "GMT+0"
## [483] "GMT0" "Greenwich"
## [485] "Hongkong" "HST"
## [487] "Iceland" "Indian/Antananarivo"
## [489] "Indian/Chagos" "Indian/Christmas"
## [491] "Indian/Cocos" "Indian/Comoro"
## [493] "Indian/Kerguelen" "Indian/Mahe"
## [495] "Indian/Maldives" "Indian/Mauritius"
## [497] "Indian/Mayotte" "Indian/Reunion"
## [499] "Iran" "Israel"
## [501] "Jamaica" "Japan"
## [503] "Kwajalein" "Libya"
## [505] "MET" "Mexico/BajaNorte"
## [507] "Mexico/BajaSur" "Mexico/General"
## [509] "MST" "MST7MDT"
## [511] "Navajo" "NZ"
## [513] "NZ-CHAT" "Pacific/Apia"
## [515] "Pacific/Auckland" "Pacific/Bougainville"
## [517] "Pacific/Chatham" "Pacific/Chuuk"
## [519] "Pacific/Easter" "Pacific/Efate"
## [521] "Pacific/Enderbury" "Pacific/Fakaofo"
## [523] "Pacific/Fiji" "Pacific/Funafuti"
## [525] "Pacific/Galapagos" "Pacific/Gambier"
## [527] "Pacific/Guadalcanal" "Pacific/Guam"
## [529] "Pacific/Honolulu" "Pacific/Johnston"
## [531] "Pacific/Kiritimati" "Pacific/Kosrae"
## [533] "Pacific/Kwajalein" "Pacific/Majuro"
## [535] "Pacific/Marquesas" "Pacific/Midway"
## [537] "Pacific/Nauru" "Pacific/Niue"
## [539] "Pacific/Norfolk" "Pacific/Noumea"
## [541] "Pacific/Pago_Pago" "Pacific/Palau"
## [543] "Pacific/Pitcairn" "Pacific/Pohnpei"
## [545] "Pacific/Ponape" "Pacific/Port_Moresby"
## [547] "Pacific/Rarotonga" "Pacific/Saipan"
## [549] "Pacific/Samoa" "Pacific/Tahiti"
## [551] "Pacific/Tarawa" "Pacific/Tongatapu"
## [553] "Pacific/Truk" "Pacific/Wake"
## [555] "Pacific/Wallis" "Pacific/Yap"
## [557] "Poland" "Portugal"
## [559] "PRC" "PST8PDT"
## [561] "ROC" "ROK"
## [563] "Singapore" "Turkey"
## [565] "UCT" "Universal"
## [567] "US/Alaska" "US/Aleutian"
## [569] "US/Arizona" "US/Central"
## [571] "US/East-Indiana" "US/Eastern"
## [573] "US/Hawaii" "US/Indiana-Starke"
## [575] "US/Michigan" "US/Mountain"
## [577] "US/Pacific" "US/Pacific-New"
## [579] "US/Samoa" "UTC"
## [581] "VERSION" "W-SU"
## [583] "WET" "Zulu"
# From ?strptime (excerpted)
#
# ** General formats **
# %c Date and time. Locale-specific on output, "%a %b %e %H:%M:%S %Y" on input.
# %F Equivalent to %Y-%m-%d (the ISO 8601 date format).
# %T Equivalent to %H:%M:%S.
# %D Date format such as %m/%d/%y: the C99 standard says it should be that exact format
# %x Date. Locale-specific on output, "%y/%m/%d" on input.
# %X Time. Locale-specific on output, "%H:%M:%S" on input.
#
# ** Key Components **
# %y Year without century (00-99). On input, values 00 to 68 are prefixed by 20 and 69 to 99 by 19
# %Y Year with century
# %m Month as decimal number (01-12).
# %b Abbreviated month name in the current locale on this platform.
# %B Full month name in the current locale.
# %d Day of the month as decimal number (01-31).
# %e Day of the month as decimal number (1-31), with a leading space for a single-digit number.
# %a Abbreviated weekday name in the current locale on this platform.
# %A Full weekday name in the current locale.
# %H Hours as decimal number (00-23)
# %I Hours as decimal number (01-12)
# %M Minute as decimal number (00-59).
# %S Second as integer (00-61), allowing for up to two leap-seconds (but POSIX-compliant implementations will ignore leap seconds).
#
# ** Additional Options **
# %C Century (00-99): the integer part of the year divided by 100.
#
# %g The last two digits of the week-based year (see %V). (Accepted but ignored on input.)
# %G The week-based year (see %V) as a decimal number. (Accepted but ignored on input.)
#
# %h Equivalent to %b.
#
# %j Day of year as decimal number (001-366).
#
# %n Newline on output, arbitrary whitespace on input.
#
# %p AM/PM indicator in the locale. Used in conjunction with %I and not with %H. An empty string in some locales (and the behaviour is undefined if used for input in such a locale). Some platforms accept %P for output, which uses a lower-case version: others will output P.
#
# %r The 12-hour clock time (using the locale's AM or PM). Only defined in some locales.
#
# %R Equivalent to %H:%M.
#
# %t Tab on output, arbitrary whitespace on input.
#
# %u Weekday as a decimal number (1-7, Monday is 1).
#
# %U Week of the year as decimal number (00-53) using Sunday as the first day 1 of the week (and typically with the first Sunday of the year as day 1 of week 1). The US convention.
#
# %V Week of the year as decimal number (01-53) as defined in ISO 8601. If the week (starting on Monday) containing 1 January has four or more days in the new year, then it is considered week 1. Otherwise, it is the last week of the previous year, and the next week is week 1. (Accepted but ignored on input.)
#
# %w Weekday as decimal number (0-6, Sunday is 0).
#
# %W Week of the year as decimal number (00-53) using Monday as the first day of week (and typically with the first Monday of the year as day 1 of week 1). The UK convention.
#
# For input, only years 0:9999 are accepted.
#
# %z Signed offset in hours and minutes from UTC, so -0800 is 8 hours behind UTC. Values up to +1400 are accepted as from R 3.1.1: previous versions only accepted up to +1200. (Standard only for output.)
#
# %Z (Output only.) Time zone abbreviation as a character string (empty if not available). This may not be reliable when a time zone has changed abbreviations over the years.
Hadley and Charlotte Wickham led a course on writing functions in R. Broadly, the course includes advice on when/how to use functions, as well as specific advice about commands available through library(purrr).
Key pieces of advice include:
John Chambers gave a few useful slogans about functions:
Each function has three components:
Only the LAST evaluated expression is returned. The use of return() is recommended only for early-returns in a special case (for example, when a break() will be called).
Further, functions can be written anonymously on the command line, such as (function (x) {x + 1}) (1:5). A function should only depend on arguments passed to it, not variables from a parent environment. Every time the function is called, it receives a clean working environment. Once it finishes, its variables are no longer available unless they were returned (either by default as the last operation, or by way of return()):
# Components of a function
args(rnorm)
## function (n, mean = 0, sd = 1)
## NULL
formals(rnorm)
## $n
##
##
## $mean
## [1] 0
##
## $sd
## [1] 1
body(rnorm)
## .Call(C_rnorm, n, mean, sd)
environment(rnorm)
## <environment: namespace:stats>
# What is passed back
# funDummy: demonstrates the two ways a function hands back a value.
#   x: a single number
# Returns 3 (after printing a message) when x is 2 or less; otherwise ceiling(x).
funDummy <- function(x) {
  isTooSmall <- !(x > 2)
  if (isTooSmall) {
    print("That is too small")
    # Early exit: return() stops the function here, nothing below is evaluated
    return(3)
  }
  # Normal path: the last evaluated expression is the function's value
  ceiling(x)
}
funDummy(1)
## [1] "That is too small"
## [1] 3
funDummy(5)
## [1] 5
# Anonymous functions
(function (x) {x + 1}) (1:5)
## [1] 2 3 4 5 6
The course includes some insightful discussion of vectors. As it happens, lists and data frames are just special collections of vectors in R. Each column of a data frame is a vector, while each element of a list is either 1) an embedded data frame (which is eventually a vector by way of columns), 2) an embedded list (which is eventually a vector by way of recursion), or 3) an actual vector.
The atomic vectors are of types logical, integer, character, and double; complex and raw are rarer types that are also available. Lists are just recursive vectors, which is to say that lists can contain other lists and can be heterogeneous. To explore vectors, you have:
Note that NULL is the absence of a vector and has length 0. NA is the absence of an element in the vector and has length 1. Nearly all operations with NA return NA; for example, NA == NA returns NA rather than TRUE, so use is.na() to test for missing values.
There are some good tips on extracting elements from a list:
# Data types
data(mtcars)
str(mtcars)
## 'data.frame': 32 obs. of 11 variables:
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
## $ disp: num 160 160 108 258 360 ...
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec: num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
## $ am : num 1 1 1 0 0 0 0 0 0 0 ...
## $ gear: num 4 4 4 3 3 3 3 4 4 4 ...
## $ carb: num 4 4 1 1 2 1 4 2 2 4 ...
typeof(mtcars) # n.b. that this is technically a "list"
## [1] "list"
length(mtcars)
## [1] 11
# NULL and NA
length(NULL)
## [1] 0
typeof(NULL)
## [1] "NULL"
length(NA)
## [1] 1
typeof(NA)
## [1] "logical"
NULL == NULL
## logical(0)
NULL == NA
## logical(0)
NA == NA
## [1] NA
is.null(NULL)
## [1] TRUE
is.null(NA)
## [1] FALSE
is.na(NULL)
## Warning in is.na(NULL): is.na() applied to non-(list or vector) of type
## 'NULL'
## logical(0)
is.na(NA)
## [1] TRUE
# Extraction
mtcars[["mpg"]][1:5]
## [1] 21.0 21.0 22.8 21.4 18.7
mtcars[[2]][1:5]
## [1] 6 6 4 6 8
mtcars$hp[1:5]
## [1] 110 110 93 110 175
# Relevant lengths
seq_along(mtcars)
## [1] 1 2 3 4 5 6 7 8 9 10 11
x <- data.frame()
seq_along(x)
## integer(0)
length(seq_along(x))
## [1] 0
# foo: print the storage type of every element (column) of x, one per line.
# Because it loops over seq_along(x), a zero-length x prints nothing.
foo <- function(x) {
  for (idx in seq_along(x)) {
    elementType <- typeof(x[[idx]])
    print(elementType)
  }
}
foo(mtcars)
## [1] "double"
## [1] "double"
## [1] "double"
## [1] "double"
## [1] "double"
## [1] "double"
## [1] "double"
## [1] "double"
## [1] "double"
## [1] "double"
## [1] "double"
foo(x) # Note that this does nothing!
data(airquality)
str(airquality)
## 'data.frame': 153 obs. of 6 variables:
## $ Ozone : int 41 36 12 18 NA 28 23 19 8 NA ...
## $ Solar.R: int 190 118 149 313 NA NA 299 99 19 194 ...
## $ Wind : num 7.4 8 12.6 11.5 14.3 14.9 8.6 13.8 20.1 8.6 ...
## $ Temp : int 67 72 74 62 56 66 65 59 61 69 ...
## $ Month : int 5 5 5 5 5 5 5 5 5 5 ...
## $ Day : int 1 2 3 4 5 6 7 8 9 10 ...
foo(airquality)
## [1] "integer"
## [1] "integer"
## [1] "double"
## [1] "integer"
## [1] "integer"
## [1] "integer"
# Range command
mpgRange <- range(mtcars$mpg)
mpgRange
## [1] 10.4 33.9
mpgScale <- (mtcars$mpg - mpgRange[1]) / (mpgRange[2] - mpgRange[1])
summary(mpgScale)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.2138 0.3745 0.4124 0.5277 1.0000
The typical arguments in a function use a consistent, simple naming convention:
Data arguments should come before detail arguments, and detail arguments should be given reasonable default values. See for example rnorm(n, mean=0, sd=1). The number requested (n) must be specified, but defaults are available for the details (mean and standard deviation).
Functions can be passed as arguments to other functions, which is at the core of functional programming. For example:
# do_math: apply a function that was passed in as an argument.
#   x:   the data to operate on
#   fun: a function taking x as its only argument
# Returns whatever fun(x) returns.
do_math <- function(x, fun) {
  fun(x)
}
do_math(1:10, fun=mean)
## [1] 5.5
do_math(1:10, fun=sd)
## [1] 3.02765
The library(purrr) takes advantage of this, and in a type-consistent manner. There are functions for:
The general arguments are .x (a list or an atomic vector) and .f which can be either a function, an anonymous function (formula with ~), or an extractor .x[[.f]]. For example:
library(purrr)
## Warning: package 'purrr' was built under R version 3.2.5
library(RColorBrewer) # Need to have in non-cached chunk for later
data(mtcars)
# Create output as a list
map(.x=mtcars, .f=sum)
## $mpg
## [1] 642.9
##
## $cyl
## [1] 198
##
## $disp
## [1] 7383.1
##
## $hp
## [1] 4694
##
## $drat
## [1] 115.09
##
## $wt
## [1] 102.952
##
## $qsec
## [1] 571.16
##
## $vs
## [1] 14
##
## $am
## [1] 13
##
## $gear
## [1] 118
##
## $carb
## [1] 90
# Create same output as a double
map_dbl(.x=mtcars, .f=sum)
## mpg cyl disp hp drat wt qsec vs
## 642.900 198.000 7383.100 4694.000 115.090 102.952 571.160 14.000
## am gear carb
## 13.000 118.000 90.000
# Create same output as integer
# map_int(.x=mtcars, .f=sum) . . . this would fail with an error since sum() returns a double here, not an integer
map_int(.x=mtcars, .f=function(x) { as.integer(round(sum(x), 0)) } )
## mpg cyl disp hp drat wt qsec vs am gear carb
## 643 198 7383 4694 115 103 571 14 13 118 90
# Same thing but using an anonymous function with ~ and .
map_int(.x=mtcars, .f = ~ as.integer(round(sum(.), 0)) )
## mpg cyl disp hp drat wt qsec vs am gear carb
## 643 198 7383 4694 115 103 571 14 13 118 90
# Create a boolean vector
# (the comparison already yields a single TRUE/FALSE per column, so no ifelse() is needed)
map_lgl(.x=mtcars, .f = ~ sum(.) > 200 )
## mpg cyl disp hp drat wt qsec vs am gear carb
## TRUE FALSE TRUE TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
# Create a character vector
# (plain if/else is the scalar form; ifelse() is intended for element-wise use on vectors)
map_chr(.x=mtcars, .f = ~ if (sum(.) > 200) "Large" else "Not So Large" )
## mpg cyl disp hp drat
## "Large" "Not So Large" "Large" "Large" "Not So Large"
## wt qsec vs am gear
## "Not So Large" "Large" "Not So Large" "Not So Large" "Not So Large"
## carb
## "Not So Large"
# Use the extractor [pulls the first row]
map_dbl(.x=mtcars, .f=1)
## mpg cyl disp hp drat wt qsec vs am gear
## 21.00 6.00 160.00 110.00 3.90 2.62 16.46 0.00 1.00 4.00
## carb
## 4.00
# Example from help file using chaining
mtcars %>%
split(.$cyl) %>%
map(~ lm(mpg ~ wt, data = .x)) %>%
map(summary) %>%
map_dbl("r.squared")
## 4 6 8
## 0.5086326 0.4645102 0.4229655
# Using sapply
sapply(split(mtcars, mtcars$cyl), FUN=function(.x) { summary(lm(mpg ~ wt, data=.x))$r.squared } )
## 4 6 8
## 0.5086326 0.4645102 0.4229655
# Use the extractor from a list
cylSplit <- split(mtcars, mtcars$cyl)
map(cylSplit, "mpg")
## $`4`
## [1] 22.8 24.4 22.8 32.4 30.4 33.9 21.5 27.3 26.0 30.4 21.4
##
## $`6`
## [1] 21.0 21.0 21.4 18.1 19.2 17.8 19.7
##
## $`8`
## [1] 18.7 14.3 16.4 17.3 15.2 10.4 10.4 14.7 15.5 15.2 13.3 19.2 15.8 15.0
map(cylSplit, "cyl")
## $`4`
## [1] 4 4 4 4 4 4 4 4 4 4 4
##
## $`6`
## [1] 6 6 6 6 6 6 6
##
## $`8`
## [1] 8 8 8 8 8 8 8 8 8 8 8 8 8 8
The purrr library has several additional interesting functions:
Some example code includes:
library(purrr) # Called again for clarity; all these key functions belong to purrr
# safely(.f, otherwise = NULL, quiet = TRUE)
safe_log10 <- safely(log10)
map(list(0, 1, 10, "a"), .f=safe_log10)
## [[1]]
## [[1]]$result
## [1] -Inf
##
## [[1]]$error
## NULL
##
##
## [[2]]
## [[2]]$result
## [1] 0
##
## [[2]]$error
## NULL
##
##
## [[3]]
## [[3]]$result
## [1] 1
##
## [[3]]$error
## NULL
##
##
## [[4]]
## [[4]]$result
## NULL
##
## [[4]]$error
## <simpleError in .f(...): non-numeric argument to mathematical function>
# possibly(.f, otherwise, quiet = TRUE)
poss_log10 <- possibly(log10, otherwise=NaN)
map_dbl(list(0, 1, 10, "a"), .f=poss_log10)
## [1] -Inf 0 1 NaN
# transpose() - note that this can become masked by data.table::transpose() so be careful
purrr::transpose(map(list(0, 1, 10, "a"), .f=safe_log10))
## $result
## $result[[1]]
## [1] -Inf
##
## $result[[2]]
## [1] 0
##
## $result[[3]]
## [1] 1
##
## $result[[4]]
## NULL
##
##
## $error
## $error[[1]]
## NULL
##
## $error[[2]]
## NULL
##
## $error[[3]]
## NULL
##
## $error[[4]]
## <simpleError in .f(...): non-numeric argument to mathematical function>
purrr::transpose(map(list(0, 1, 10, "a"), .f=safe_log10))$result
## [[1]]
## [1] -Inf
##
## [[2]]
## [1] 0
##
## [[3]]
## [1] 1
##
## [[4]]
## NULL
unlist(purrr::transpose(map(list(0, 1, 10, "a"), .f=safe_log10))$result)
## [1] -Inf 0 1
purrr::transpose(map(list(0, 1, 10, "a"), .f=safe_log10))$error
## [[1]]
## NULL
##
## [[2]]
## NULL
##
## [[3]]
## NULL
##
## [[4]]
## <simpleError in .f(...): non-numeric argument to mathematical function>
map_lgl(purrr::transpose(map(list(0, 1, 10, "a"), .f=safe_log10))$error, is.null)
## [1] TRUE TRUE TRUE FALSE
# map2(.x, .y, .f)
map2(list(5, 10, 20), list(1, 2, 3), .f=rnorm) # rnorm(5, 1), rnorm(10, 2), and rnorm(20, 3)
## [[1]]
## [1] 2.3287206 2.6164312 0.4091215 0.8729018 2.3230746
##
## [[2]]
## [1] 2.042111 3.005090 2.119607 2.805102 1.391124 1.581452 1.661046
## [8] 1.867352 2.534756 1.010977
##
## [[3]]
## [1] 3.284111 4.600825 2.738266 2.979224 3.445683 3.936116 3.809101
## [8] 3.321820 2.870224 3.352113 1.959291 1.343982 2.408308 4.217730
## [15] 5.288376 2.269969 2.057404 2.732965 4.040102 3.758804
# pmap(.l, .f)
pmap(list(n=list(5, 10, 20), mean=list(1, 5, 10), sd=list(0.1, 0.5, 0.1)), rnorm)
## [[1]]
## [1] 0.9411217 1.0023844 1.0140966 1.0188091 0.8655009
##
## [[2]]
## [1] 5.114076 5.809002 4.705152 5.381185 4.880197 5.771555 4.501960
## [8] 4.785909 4.864858 5.263266
##
## [[3]]
## [1] 9.984199 9.893967 9.842237 10.105544 10.054912 9.922942 9.924452
## [8] 10.067071 10.017669 9.950732 9.953487 9.916090 10.026484 10.170153
## [15] 10.125430 9.921894 10.072730 10.104990 10.043501 10.086416
# invoke_map(.f, .x, ...)
invoke_map(list(rnorm, runif, rexp), n=5)
## [[1]]
## [1] 3.2271821 0.9257784 -1.1470560 0.9179828 1.1532561
##
## [[2]]
## [1] 0.98626797 0.01417451 0.29363518 0.50323053 0.01672044
##
## [[3]]
## [1] 0.3227441 1.0964695 2.3726571 1.1701611 0.2753049
# walk() is for the side effects of a function
x <- list(1, "\n\ta\n", 3)
x %>% walk(cat)
## 1
## a
## 3
# Chaining is available by way of the %>% operator
# Titles for the three histograms, listed in the same order as the generators passed to invoke_map() below
pretty_titles <- c("N(0, 1)", "Uniform(0, 1)", "Exponential (rate=1)")
# Fix the RNG seed so that the simulated draws (and the summaries printed below) are reproducible
set.seed(1607120947)
# Call each of rnorm, runif, and rexp with n=5000; x becomes a list with one element per generator
x <- invoke_map(list(rnorm, runif, rexp), n=5000)
# foo() applies summary() to every element of the list it is given
foo <- function(x) { map(x, .f=summary) }
# Lay the three histograms out side by side (1 row, 3 columns)
par(mfrow=c(1, 3))
# pwalk() draws one histogram per (x, main) pair purely for the side effect; the list it was given is
# then piped on to map(), so foo() summarizes both the $x and the $main components (see output below)
pwalk(list(x=x, main=pretty_titles), .f=hist, xlab="", col="light blue") %>% map(.f=foo)
## $x
## $x[[1]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.711000 -0.637800 -0.000217 0.006543 0.671800 3.633000
##
## $x[[2]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0001241 0.2518000 0.5012000 0.5028000 0.7566000 0.9999000
##
## $x[[3]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00001 0.29140 0.68340 0.98260 1.37900 8.46300
##
##
## $main
## $main[[1]]
## Length Class Mode
## 1 character character
##
## $main[[2]]
## Length Class Mode
## 1 character character
##
## $main[[3]]
## Length Class Mode
## 1 character character
par(mfrow=c(1, 1))
There are two potentially desirable behaviors with functions:
As a best practice, R functions that will be used for programming (as opposed to interactive command line work) should be written in a robust manner. Three standard problems should be avoided/mitigated:
There are several methods available for throwing errors within an R function:
One example that commonly creates surprises is the [,] operator for extraction. Adding [ , , drop=FALSE] ensures that you will still have what you passed (e.g., a matrix or data frame) rather than conversion of a chunk of data to a vector.
Another common source of error is sapply(), which will return a vector when it can and a list otherwise. The map() and map_*() functions (e.g., map_dbl(), map_chr()) in purrr are designed to be type-stable; if the output is not of the expected type, they will error out.
Non-standard evaluations take advantage of the existence of something else (e.g., a variable in the parent environment that has not been passed). This can cause confusion and improper results.
Pure functions have the key properties that 1) their output depends only on their inputs, and 2) they do not impact the outside world other than by way of their return value. Specifically, the function should not depend on how the user has configured their global options as shown in options(), nor should it modify those options() settings upon return of control to the parent environment.
A few examples are shown below:
# Throwing errors to stop a function (cannot actually run these!)
# stopifnot(FALSE)
# if (FALSE) { stop("Error: ", call.=FALSE) }
# if (FALSE) { stop("Error: This condition needed to be set as TRUE", call.=FALSE) }
# Row extraction with and without drop=FALSE, using a 3x3 integer matrix filled row by row
mtxTest <- matrix(seq_len(9), nrow=3, ncol=3, byrow=TRUE)
class(mtxTest)
## [1] "matrix"
# Default extraction: a single row loses its dimensions and comes back as a plain vector
mtxTest[1, ]
## [1] 1 2 3
class(mtxTest[1, ])
## [1] "integer"
# With drop=FALSE the same row is kept as a 1x3 matrix
mtxTest[1, , drop=FALSE]
## [,1] [,2] [,3]
## [1,] 1 2 3
class(mtxTest[1, , drop=FALSE])
## [1] "matrix"
# sapply() picks its return type from the results, so the same call shape can yield different classes
foo <- function(x) {
  x^2
}
# Every result has length 1, so the results are simplified to a vector
sapply(seq_len(5), foo)
## [1] 1 4 9 16 25
class(sapply(seq_len(5), foo))
## [1] "numeric"
# c() splices the inner list into the outer one: seven length-1 elements, so still a vector
sapply(c(1, list(1.5, 2, 2.5), 3, 4, 5), foo)
## [1] 1.00 2.25 4.00 6.25 9.00 16.00 25.00
class(sapply(c(1, list(1.5, 2, 2.5), 3, 4, 5), foo))
## [1] "numeric"
# Here the second element has length 3, so the results cannot be simplified and a list comes back
sapply(list(1, c(1.5, 2, 2.5), 3, 4, 5), foo)
## [[1]]
## [1] 1
##
## [[2]]
## [1] 2.25 4.00 6.25
##
## [[3]]
## [1] 9
##
## [[4]]
## [1] 16
##
## [[5]]
## [1] 25
class(sapply(list(1, c(1.5, 2, 2.5), 3, 4, 5), foo))
## [1] "list"
This was a very enjoyable and instructive course.
This course provides an overview of loading data into R from five main sources:
At the most basic level, the utils library easily handles reading most types of flat files:
There are also European equivalents in case the decimal needs to be set as “,” to read in the file:
The file.path() command is a nice way to put together file paths. It is more or less equivalent to paste(..., sep=“/”), but with the benefit that the separator defaults to the platform's file separator (.Platform$file.sep), so it may be easier to use across platforms.
Further, there is the option to use the colClasses argument to specify the type of each column, with “NULL” meaning do not import. Abbreviations can be used for these as well (by way of R.utils::colClasses()):
# colClasses (relevant abbreviations)
R.utils::colClasses("-?cdfilnrzDP")
## [1] "NULL" "NA" "character" "double" "factor"
## [6] "integer" "logical" "numeric" "raw" "complex"
## [11] "Date" "POSIXct"
# file.path example
file.path("..", "myplot.pdf")
## [1] "../myplot.pdf"
# Key documentation for reading flat files
#
# read.table(file, header = FALSE, sep = "", quote = "\"'",
# dec = ".", numerals = c("allow.loss", "warn.loss", "no.loss"),
# row.names, col.names, as.is = !stringsAsFactors,
# na.strings = "NA", colClasses = NA, nrows = -1,
# skip = 0, check.names = TRUE, fill = !blank.lines.skip,
# strip.white = FALSE, blank.lines.skip = TRUE,
# comment.char = "#",
# allowEscapes = FALSE, flush = FALSE,
# stringsAsFactors = default.stringsAsFactors(),
# fileEncoding = "", encoding = "unknown", text, skipNul = FALSE)
#
# read.csv(file, header = TRUE, sep = ",", quote = "\"",
# dec = ".", fill = TRUE, comment.char = "", ...)
#
# read.csv2(file, header = TRUE, sep = ";", quote = "\"",
# dec = ",", fill = TRUE, comment.char = "", ...)
#
# read.delim(file, header = TRUE, sep = "\t", quote = "\"",
# dec = ".", fill = TRUE, comment.char = "", ...)
#
# read.delim2(file, header = TRUE, sep = "\t", quote = "\"",
# dec = ",", fill = TRUE, comment.char = "", ...)
There are also two libraries that can be especially helpful for reading in flat files - readr and data.table.
read_tsv is for tab-separated values
Further, the library(readxl) is handy for loading Excel sheets:
R can also load files from common statistical software such as SAS, STATA, SPSS, and MATLAB/Octave. The haven package (by Wickham) and the foreign package (by the R core team) are two common examples. The R.matlab package allows for reading from and writing to MATLAB/Octave files:
The library(haven) contains wrappers to the ReadStat package, a C library by Evan Miller, for reading files from SAS, STATA, and SPSS:
The library(foreign) can read/write all types of foreign formats, with some caveats:
Finally, the R.matlab() library is available for reading/writing MATLAB/Octave files. Per the help file:
Methods readMat() and writeMat() for reading and writing MAT files. For user with MATLAB v6 or newer installed (either locally or on a remote host), the package also provides methods for controlling MATLAB (trademark) via R and sending and retrieving data between R and MATLAB.
In brief, this package provides a one-directional interface from R to MATLAB, with communication taking place via a TCP/IP connection and with data transferred either through another connection or via the file system. On the MATLAB side, the TCP/IP connection is handled by a small Java add-on.
The methods for reading and writing MAT files are stable. The R to MATLAB interface, that is the Matlab class, is less prioritized and should be considered a beta version.
Relational databases in R (DBMS tend to use SQL for queries), including libraries:
Conventions are specified in DBI; see library(DBI):
Create the connection as “con” (or whatever) and then use that elsewhere:
When finished, dbDisconnect(con) as a courtesy so as to not tie up resources.
SQL queries from inside R - per previous, library(DBI) and then create the connection “con”:
For example, using “./SQLforDataCampRMD_v01.db”, run a few SQL commands:
# uses libraries DBI for the connection and RSQLite to interface with SQLite Browser on my machine
con <- DBI::dbConnect(RSQLite::SQLite(), "SQLforDataCampRMD_v01.db")
# List the tables, and drop dummy if it already exists
# (DROP/CREATE/INSERT return no result set, so they are sent with dbExecute(), which returns the
#  number of rows affected - not shown here; dbGetQuery() is kept for the SELECT)
DBI::dbListTables(con)
## [1] "dummy"
DBI::dbExecute(con, "DROP TABLE IF EXISTS dummy")
# Confirm the table is gone, then create a blank table and populate it
DBI::dbListTables(con)
## character(0)
DBI::dbExecute(con, "CREATE TABLE IF NOT EXISTS dummy (id PRIMARY KEY, name CHAR)")
DBI::dbExecute(con, "INSERT OR IGNORE INTO dummy (id, name) VALUES (1, 'Amy')")
DBI::dbExecute(con, "INSERT OR IGNORE INTO dummy (id, name) VALUES (2, 'Bill')")
DBI::dbExecute(con, "INSERT OR IGNORE INTO dummy (id, name) VALUES (2, 'Jen')") # Should do nothing (id 2 already exists)
DBI::dbGetQuery(con, "SELECT * FROM dummy")
## id name
## 1 1 Amy
## 2 2 Bill
DBI::dbListTables(con)
## [1] "dummy"
# Can continue passing SQL commands back and forth as needed
# Close the connection
DBI::dbDisconnect(con)
## [1] TRUE
Many of the R read-in libraries already work well with web data. For example, read.csv(“mywebsite.com”, stringsAsFactors=FALSE) will read a CSV right off the internet. Further, there are options for:
The jsonlite library is good for working with JSON:
Prettify adds indentation to a JSON string; minify removes all indentation/whitespace:
jsonLoc <- file.path("../../..", "PythonDirectory", "UMModule04", "roster_data.json")
jsonData <- jsonlite::fromJSON(jsonLoc)
str(jsonData)
## chr [1:379, 1:3] "Calvin" "Wilson" "Emi" "Rosina" "Sylvie" ...
head(jsonData)
## [,1] [,2] [,3]
## [1,] "Calvin" "si110" "1"
## [2,] "Wilson" "si110" "0"
## [3,] "Emi" "si110" "0"
## [4,] "Rosina" "si110" "0"
## [5,] "Sylvie" "si110" "0"
## [6,] "Katarzyna" "si110" "0"
The general analysis pipeline is Collect -> Clean -> Analyze -> Report. Cleaning is needed so the raw data can work with more traditional tools (e.g., packages in Python or R). 50% - 80% of time is spent in the Collect/Clean realm, even though this is not the most exciting (and thus taught) part of data analysis. There are generally three stages of data cleaning: Explore -> Tidy -> Prepare
Exploring the Data:
Viewing the Data:
Tidy data - Wickham 2014, Principles of Tidy Data:
The principles of tidy data can be implemented using library(tidyr):
Common symptoms of messy data include:
Example code includes:
# tidyr::gather()
stocks <- data.frame(time = as.Date('2009-01-01') + 0:4,
X = rnorm(5, 0, 1), Y = rnorm(5, 0, 2), Z = rnorm(5, 0, 4)
)
stocks
## time X Y Z
## 1 2009-01-01 1.64736472 -0.1020457 -8.074672
## 2 2009-01-02 0.32981671 -0.2377234 7.617473
## 3 2009-01-03 0.05010405 -0.7091054 -9.770047
## 4 2009-01-04 0.41187479 1.1899260 -1.655071
## 5 2009-01-05 -2.20625659 -1.1299452 1.615068
# will create new columns stock (each of X, Y, Z) and price (the values that had been in X, Y, and Z),
# while not gathering the time variable; final table will be time-stock-price
stockGather <- tidyr::gather(stocks, stock, price, -time)
stockGather
## time stock price
## 1 2009-01-01 X 1.64736472
## 2 2009-01-02 X 0.32981671
## 3 2009-01-03 X 0.05010405
## 4 2009-01-04 X 0.41187479
## 5 2009-01-05 X -2.20625659
## 6 2009-01-01 Y -0.10204566
## 7 2009-01-02 Y -0.23772338
## 8 2009-01-03 Y -0.70910541
## 9 2009-01-04 Y 1.18992602
## 10 2009-01-05 Y -1.12994523
## 11 2009-01-01 Z -8.07467238
## 12 2009-01-02 Z 7.61747283
## 13 2009-01-03 Z -9.77004663
## 14 2009-01-04 Z -1.65507149
## 15 2009-01-05 Z 1.61506772
# tidyr::spread()
tidyr::spread(stockGather, stock, price)
## time X Y Z
## 1 2009-01-01 1.64736472 -0.1020457 -8.074672
## 2 2009-01-02 0.32981671 -0.2377234 7.617473
## 3 2009-01-03 0.05010405 -0.7091054 -9.770047
## 4 2009-01-04 0.41187479 1.1899260 -1.655071
## 5 2009-01-05 -2.20625659 -1.1299452 1.615068
# TRUE (this fully reverses what the gather function has done)
identical(tidyr::spread(stockGather, stock, price), stocks)
## [1] TRUE
# tidyr::separate()
df <- data.frame(x = c(NA, "a.b", "a.d", "b.c"))
df
## x
## 1 <NA>
## 2 a.b
## 3 a.d
## 4 b.c
# by default, the splits occur on anything that is not alphanumeric,
# so you get column A as whatever is before the dot and column B as whatever is after the dot
dfSep <- tidyr::separate(df, x, c("A", "B"))
dfSep
## A B
## 1 <NA> <NA>
## 2 a b
## 3 a d
## 4 b c
# tidyr::unite()
# unite() pastes the columns together as text, so the all-NA row becomes the literal string "NANA"
tidyr::unite(dfSep, united, c(A, B), sep="")
## united
## 1 NANA
## 2 ab
## 3 ad
## 4 bc
is.na(dfSep) # caution: before uniting, row 1 is flagged as missing in both columns . . .
## A B
## 1 TRUE TRUE
## 2 FALSE FALSE
## 3 FALSE FALSE
## 4 FALSE FALSE
is.na(tidyr::unite(dfSep, united, c(A, B), sep="")) # caution: . . . after uniting, "NANA" is no longer flagged as NA
## united
## 1 FALSE
## 2 FALSE
## 3 FALSE
## 4 FALSE
The tolower() and toupper() commands can be very useful also
Example code includes:
# lubridate::ymd()
lubridate::ymd("160720")
## [1] "2016-07-20 UTC"
lubridate::ymd("2016-7-20")
## [1] "2016-07-20 UTC"
lubridate::ymd("16jul20")
## [1] "2016-07-20 UTC"
lubridate::ymd("16/07/20")
## [1] "2016-07-20 UTC"
# lubridate::hms()
lubridate::hms("07h15:00")
## [1] "7H 15M 0S"
lubridate::hms("17 hours, 15 minutes 00 seconds")
## [1] "17H 15M 0S"
lubridate::hms("07-15-00")
## [1] "7H 15M 0S"
# From ?stringr::str_detect
#
# str_detect(string, pattern)
# string Input vector. Either a character vector, or something coercible to one.
# pattern Pattern to look for. The default interpretation is a regular expression, as described in stringi-search-regex. Control options with regex(). Match a fixed string (i.e. by comparing only bytes), using fixed(x). This is fast, but approximate. Generally, for matching human text, you'll want coll(x) which respects character matching rules for the specified locale. Match character, word, line and sentence boundaries with boundary(). An empty pattern, "", is equivalent to boundary("character").
#
fruit <- c("apple", "banana", "pear", "pinapple")
stringr::str_detect(fruit, "a")
## [1] TRUE TRUE TRUE TRUE
stringr::str_detect(fruit, "^a")
## [1] TRUE FALSE FALSE FALSE
stringr::str_detect(fruit, "a$")
## [1] FALSE TRUE FALSE FALSE
stringr::str_detect(fruit, "b")
## [1] FALSE TRUE FALSE FALSE
stringr::str_detect(fruit, "[aeiou]")
## [1] TRUE TRUE TRUE TRUE
# Also vectorised over pattern
stringr::str_detect("aecfg", letters)
## [1] TRUE FALSE TRUE FALSE TRUE TRUE TRUE FALSE FALSE FALSE FALSE
## [12] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [23] FALSE FALSE FALSE FALSE
# From ?stringr::str_replace
#
# str_replace(string, pattern, replacement)
# str_replace_all(string, pattern, replacement)
# string Input vector. Either a character vector, or something coercible to one.
# pattern, replacement Supply separate pattern and replacement strings to vectorise over the patterns. References of the form \1, \2 will be replaced with the contents of the respective matched group (created by ()) within the pattern. For str_replace_all only, you can perform multiple patterns and replacements to each string, by passing a named character to pattern.
#
someNA <- c(letters, "", LETTERS, "")
someNA[someNA==""] <- NA
someNA
## [1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j" "k" "l" "m" "n" "o" "p" "q"
## [18] "r" "s" "t" "u" "v" "w" "x" "y" "z" NA "A" "B" "C" "D" "E" "F" "G"
## [35] "H" "I" "J" "K" "L" "M" "N" "O" "P" "Q" "R" "S" "T" "U" "V" "W" "X"
## [52] "Y" "Z" NA
fruits <- c("one apple", "two pears", "three bananas")
stringr::str_replace(fruits, "[aeiou]", "-") # Replace FIRST instance
## [1] "-ne apple" "tw- pears" "thr-e bananas"
stringr::str_replace_all(fruits, "[aeiou]", "-") # Replace ALL instances
## [1] "-n- -ppl-" "tw- p--rs" "thr-- b-n-n-s"
stringr::str_replace(fruits, "([aeiou])", "\\1\\1\\1") # Triple up on the first vowel
## [1] "ooone apple" "twooo pears" "threeee bananas"
stringr::str_replace(fruits, "[aeiou]", c("1", "2", "3")) # First vowel to 1, 2, 3 in word 1, 2, 3
## [1] "1ne apple" "tw2 pears" "thr3e bananas"
stringr::str_replace(fruits, c("a", "e", "i"), "-") # First a -> - in word 1, first e -> - in word 2 . . .
## [1] "one -pple" "two p-ars" "three bananas"
stringr::str_replace_all(fruits, "([aeiou])", "\\1\\1") # Double up on all vowels
## [1] "oonee aapplee" "twoo peeaars" "threeee baanaanaas"
stringr::str_replace_all(fruits, "[aeiou]", c("1", "2", "3")) # All vowels to 1, 2, 3, in word 1, 2, 3
## [1] "1n1 1ppl1" "tw2 p22rs" "thr33 b3n3n3s"
stringr::str_replace_all(fruits, c("a", "e", "i"), "-") # All a -> - in word 1, . . .
## [1] "one -pple" "two p-ars" "three bananas"
Further, the outline from the weather gathering data cleaning challenge is noted:
The library(dplyr) is a grammar of data manipulation. It is written in C++ so you get the speed of C with the convenience of R. It is in essence the data frame to data frame portion of plyr (plyr was the original Split-Apply-Combine). May want to look into count, transmute, and other verbs added after this summary was written.
The examples use data(hflights) from library(hflights):
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:purrr':
##
## order_by
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(hflights)
data(hflights)
head(hflights)
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## 5424 2011 1 1 6 1400 1500 AA
## 5425 2011 1 2 7 1401 1501 AA
## 5426 2011 1 3 1 1352 1502 AA
## 5427 2011 1 4 2 1403 1513 AA
## 5428 2011 1 5 3 1405 1507 AA
## 5429 2011 1 6 4 1359 1503 AA
## FlightNum TailNum ActualElapsedTime AirTime ArrDelay DepDelay Origin
## 5424 428 N576AA 60 40 -10 0 IAH
## 5425 428 N557AA 60 45 -9 1 IAH
## 5426 428 N541AA 70 48 -8 -8 IAH
## 5427 428 N403AA 70 39 3 3 IAH
## 5428 428 N492AA 62 44 -3 5 IAH
## 5429 428 N262AA 64 45 -7 -1 IAH
## Dest Distance TaxiIn TaxiOut Cancelled CancellationCode Diverted
## 5424 DFW 224 7 13 0 0
## 5425 DFW 224 6 9 0 0
## 5426 DFW 224 5 17 0 0
## 5427 DFW 224 9 22 0 0
## 5428 DFW 224 9 9 0 0
## 5429 DFW 224 6 13 0 0
summary(hflights)
## Year Month DayofMonth DayOfWeek
## Min. :2011 Min. : 1.000 Min. : 1.00 Min. :1.000
## 1st Qu.:2011 1st Qu.: 4.000 1st Qu.: 8.00 1st Qu.:2.000
## Median :2011 Median : 7.000 Median :16.00 Median :4.000
## Mean :2011 Mean : 6.514 Mean :15.74 Mean :3.948
## 3rd Qu.:2011 3rd Qu.: 9.000 3rd Qu.:23.00 3rd Qu.:6.000
## Max. :2011 Max. :12.000 Max. :31.00 Max. :7.000
##
## DepTime ArrTime UniqueCarrier FlightNum
## Min. : 1 Min. : 1 Length:227496 Min. : 1
## 1st Qu.:1021 1st Qu.:1215 Class :character 1st Qu.: 855
## Median :1416 Median :1617 Mode :character Median :1696
## Mean :1396 Mean :1578 Mean :1962
## 3rd Qu.:1801 3rd Qu.:1953 3rd Qu.:2755
## Max. :2400 Max. :2400 Max. :7290
## NA's :2905 NA's :3066
## TailNum ActualElapsedTime AirTime ArrDelay
## Length:227496 Min. : 34.0 Min. : 11.0 Min. :-70.000
## Class :character 1st Qu.: 77.0 1st Qu.: 58.0 1st Qu.: -8.000
## Mode :character Median :128.0 Median :107.0 Median : 0.000
## Mean :129.3 Mean :108.1 Mean : 7.094
## 3rd Qu.:165.0 3rd Qu.:141.0 3rd Qu.: 11.000
## Max. :575.0 Max. :549.0 Max. :978.000
## NA's :3622 NA's :3622 NA's :3622
## DepDelay Origin Dest Distance
## Min. :-33.000 Length:227496 Length:227496 Min. : 79.0
## 1st Qu.: -3.000 Class :character Class :character 1st Qu.: 376.0
## Median : 0.000 Mode :character Mode :character Median : 809.0
## Mean : 9.445 Mean : 787.8
## 3rd Qu.: 9.000 3rd Qu.:1042.0
## Max. :981.000 Max. :3904.0
## NA's :2905
## TaxiIn TaxiOut Cancelled CancellationCode
## Min. : 1.000 Min. : 1.00 Min. :0.00000 Length:227496
## 1st Qu.: 4.000 1st Qu.: 10.00 1st Qu.:0.00000 Class :character
## Median : 5.000 Median : 14.00 Median :0.00000 Mode :character
## Mean : 6.099 Mean : 15.09 Mean :0.01307
## 3rd Qu.: 7.000 3rd Qu.: 18.00 3rd Qu.:0.00000
## Max. :165.000 Max. :163.00 Max. :1.00000
## NA's :3066 NA's :2947
## Diverted
## Min. :0.000000
## 1st Qu.:0.000000
## Median :0.000000
## Mean :0.002853
## 3rd Qu.:0.000000
## Max. :1.000000
##
The “tbl” is a special type of data frame, which is very helpful for printing:
An interesting way to do a lookup table:
See for example:
# Lookup table: a named character vector keyed by the two-character carrier code
lut <- c(
  AA = "American",
  AS = "Alaska",
  B6 = "JetBlue",
  CO = "Continental",
  DL = "Delta",
  OO = "SkyWest",
  UA = "United",
  US = "US_Airways",
  WN = "Southwest",
  EV = "Atlantic_Southeast",
  F9 = "Frontier",
  FL = "AirTran",
  MQ = "American_Eagle",
  XE = "ExpressJet",
  YV = "Mesa"
)
# Indexing the lookup by the code column returns the matching carrier name for every row
hflights$Carrier <- lut[hflights[["UniqueCarrier"]]]
glimpse(hflights)
## Observations: 227,496
## Variables: 22
## $ Year (int) 2011, 2011, 2011, 2011, 2011, 2011, 2011, 20...
## $ Month (int) 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ DayofMonth (int) 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1...
## $ DayOfWeek (int) 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6,...
## $ DepTime (int) 1400, 1401, 1352, 1403, 1405, 1359, 1359, 13...
## $ ArrTime (int) 1500, 1501, 1502, 1513, 1507, 1503, 1509, 14...
## $ UniqueCarrier (chr) "AA", "AA", "AA", "AA", "AA", "AA", "AA", "A...
## $ FlightNum (int) 428, 428, 428, 428, 428, 428, 428, 428, 428,...
## $ TailNum (chr) "N576AA", "N557AA", "N541AA", "N403AA", "N49...
## $ ActualElapsedTime (int) 60, 60, 70, 70, 62, 64, 70, 59, 71, 70, 70, ...
## $ AirTime (int) 40, 45, 48, 39, 44, 45, 43, 40, 41, 45, 42, ...
## $ ArrDelay (int) -10, -9, -8, 3, -3, -7, -1, -16, 44, 43, 29,...
## $ DepDelay (int) 0, 1, -8, 3, 5, -1, -1, -5, 43, 43, 29, 19, ...
## $ Origin (chr) "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "I...
## $ Dest (chr) "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", "D...
## $ Distance (int) 224, 224, 224, 224, 224, 224, 224, 224, 224,...
## $ TaxiIn (int) 7, 6, 5, 9, 9, 6, 12, 7, 8, 6, 8, 4, 6, 5, 6...
## $ TaxiOut (int) 13, 9, 17, 22, 9, 13, 15, 12, 22, 19, 20, 11...
## $ Cancelled (int) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ CancellationCode (chr) "", "", "", "", "", "", "", "", "", "", "", ...
## $ Diverted (int) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Carrier (chr) "American", "American", "American", "America...
There are five main verbs in dplyr:
There is also the group_by capability for summaries of sub-groups:
The dplyr library can also work with databases. It only loads the data that you need, and you do not need to know the relevant SQL code – dplyr writes the SQL code for you.
Basic select and mutate examples include:
data(hflights)
# Make it faster, as well as a prettier printer
hflights <- tbl_df(hflights)
hflights
## Source: local data frame [227,496 x 21]
##
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## (int) (int) (int) (int) (int) (int) (chr)
## 1 2011 1 1 6 1400 1500 AA
## 2 2011 1 2 7 1401 1501 AA
## 3 2011 1 3 1 1352 1502 AA
## 4 2011 1 4 2 1403 1513 AA
## 5 2011 1 5 3 1405 1507 AA
## 6 2011 1 6 4 1359 1503 AA
## 7 2011 1 7 5 1359 1509 AA
## 8 2011 1 8 6 1355 1454 AA
## 9 2011 1 9 7 1443 1554 AA
## 10 2011 1 10 1 1443 1553 AA
## .. ... ... ... ... ... ... ...
## Variables not shown: FlightNum (int), TailNum (chr), ActualElapsedTime
## (int), AirTime (int), ArrDelay (int), DepDelay (int), Origin (chr), Dest
## (chr), Distance (int), TaxiIn (int), TaxiOut (int), Cancelled (int),
## CancellationCode (chr), Diverted (int)
class(hflights)
## [1] "tbl_df" "tbl" "data.frame"
# Select examples
select(hflights, ActualElapsedTime, AirTime, ArrDelay, DepDelay)
## Source: local data frame [227,496 x 4]
##
## ActualElapsedTime AirTime ArrDelay DepDelay
## (int) (int) (int) (int)
## 1 60 40 -10 0
## 2 60 45 -9 1
## 3 70 48 -8 -8
## 4 70 39 3 3
## 5 62 44 -3 5
## 6 64 45 -7 -1
## 7 70 43 -1 -1
## 8 59 40 -16 -5
## 9 71 41 44 43
## 10 70 45 43 43
## .. ... ... ... ...
select(hflights, Origin:Cancelled)
## Source: local data frame [227,496 x 6]
##
## Origin Dest Distance TaxiIn TaxiOut Cancelled
## (chr) (chr) (int) (int) (int) (int)
## 1 IAH DFW 224 7 13 0
## 2 IAH DFW 224 6 9 0
## 3 IAH DFW 224 5 17 0
## 4 IAH DFW 224 9 22 0
## 5 IAH DFW 224 9 9 0
## 6 IAH DFW 224 6 13 0
## 7 IAH DFW 224 12 15 0
## 8 IAH DFW 224 7 12 0
## 9 IAH DFW 224 8 22 0
## 10 IAH DFW 224 6 19 0
## .. ... ... ... ... ... ...
select(hflights, Year:DayOfWeek, ArrDelay:Diverted)
## Source: local data frame [227,496 x 14]
##
## Year Month DayofMonth DayOfWeek ArrDelay DepDelay Origin Dest
## (int) (int) (int) (int) (int) (int) (chr) (chr)
## 1 2011 1 1 6 -10 0 IAH DFW
## 2 2011 1 2 7 -9 1 IAH DFW
## 3 2011 1 3 1 -8 -8 IAH DFW
## 4 2011 1 4 2 3 3 IAH DFW
## 5 2011 1 5 3 -3 5 IAH DFW
## 6 2011 1 6 4 -7 -1 IAH DFW
## 7 2011 1 7 5 -1 -1 IAH DFW
## 8 2011 1 8 6 -16 -5 IAH DFW
## 9 2011 1 9 7 44 43 IAH DFW
## 10 2011 1 10 1 43 43 IAH DFW
## .. ... ... ... ... ... ... ... ...
## Variables not shown: Distance (int), TaxiIn (int), TaxiOut (int),
## Cancelled (int), CancellationCode (chr), Diverted (int)
select(hflights, ends_with("Delay"))
## Source: local data frame [227,496 x 2]
##
## ArrDelay DepDelay
## (int) (int)
## 1 -10 0
## 2 -9 1
## 3 -8 -8
## 4 3 3
## 5 -3 5
## 6 -7 -1
## 7 -1 -1
## 8 -16 -5
## 9 44 43
## 10 43 43
## .. ... ...
select(hflights, UniqueCarrier, ends_with("Num"), starts_with("Cancel"))
## Source: local data frame [227,496 x 5]
##
## UniqueCarrier FlightNum TailNum Cancelled CancellationCode
## (chr) (int) (chr) (int) (chr)
## 1 AA 428 N576AA 0
## 2 AA 428 N557AA 0
## 3 AA 428 N541AA 0
## 4 AA 428 N403AA 0
## 5 AA 428 N492AA 0
## 6 AA 428 N262AA 0
## 7 AA 428 N493AA 0
## 8 AA 428 N477AA 0
## 9 AA 428 N476AA 0
## 10 AA 428 N504AA 0
## .. ... ... ... ... ...
select(hflights, ends_with("Time"), ends_with("Delay"))
## Source: local data frame [227,496 x 6]
##
## DepTime ArrTime ActualElapsedTime AirTime ArrDelay DepDelay
## (int) (int) (int) (int) (int) (int)
## 1 1400 1500 60 40 -10 0
## 2 1401 1501 60 45 -9 1
## 3 1352 1502 70 48 -8 -8
## 4 1403 1513 70 39 3 3
## 5 1405 1507 62 44 -3 5
## 6 1359 1503 64 45 -7 -1
## 7 1359 1509 70 43 -1 -1
## 8 1355 1454 59 40 -16 -5
## 9 1443 1554 71 41 44 43
## 10 1443 1553 70 45 43 43
## .. ... ... ... ... ... ...
# Mutate example
m1 <- mutate(hflights, loss = ArrDelay - DepDelay, loss_ratio = loss / DepDelay)
class(m1)
## [1] "tbl_df" "tbl" "data.frame"
m1
## Source: local data frame [227,496 x 23]
##
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## (int) (int) (int) (int) (int) (int) (chr)
## 1 2011 1 1 6 1400 1500 AA
## 2 2011 1 2 7 1401 1501 AA
## 3 2011 1 3 1 1352 1502 AA
## 4 2011 1 4 2 1403 1513 AA
## 5 2011 1 5 3 1405 1507 AA
## 6 2011 1 6 4 1359 1503 AA
## 7 2011 1 7 5 1359 1509 AA
## 8 2011 1 8 6 1355 1454 AA
## 9 2011 1 9 7 1443 1554 AA
## 10 2011 1 10 1 1443 1553 AA
## .. ... ... ... ... ... ... ...
## Variables not shown: FlightNum (int), TailNum (chr), ActualElapsedTime
## (int), AirTime (int), ArrDelay (int), DepDelay (int), Origin (chr), Dest
## (chr), Distance (int), TaxiIn (int), TaxiOut (int), Cancelled (int),
## CancellationCode (chr), Diverted (int), loss (int), loss_ratio (dbl)
glimpse(m1)
## Observations: 227,496
## Variables: 23
## $ Year (int) 2011, 2011, 2011, 2011, 2011, 2011, 2011, 20...
## $ Month (int) 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ DayofMonth (int) 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1...
## $ DayOfWeek (int) 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6,...
## $ DepTime (int) 1400, 1401, 1352, 1403, 1405, 1359, 1359, 13...
## $ ArrTime (int) 1500, 1501, 1502, 1513, 1507, 1503, 1509, 14...
## $ UniqueCarrier (chr) "AA", "AA", "AA", "AA", "AA", "AA", "AA", "A...
## $ FlightNum (int) 428, 428, 428, 428, 428, 428, 428, 428, 428,...
## $ TailNum (chr) "N576AA", "N557AA", "N541AA", "N403AA", "N49...
## $ ActualElapsedTime (int) 60, 60, 70, 70, 62, 64, 70, 59, 71, 70, 70, ...
## $ AirTime (int) 40, 45, 48, 39, 44, 45, 43, 40, 41, 45, 42, ...
## $ ArrDelay (int) -10, -9, -8, 3, -3, -7, -1, -16, 44, 43, 29,...
## $ DepDelay (int) 0, 1, -8, 3, 5, -1, -1, -5, 43, 43, 29, 19, ...
## $ Origin (chr) "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "I...
## $ Dest (chr) "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", "D...
## $ Distance (int) 224, 224, 224, 224, 224, 224, 224, 224, 224,...
## $ TaxiIn (int) 7, 6, 5, 9, 9, 6, 12, 7, 8, 6, 8, 4, 6, 5, 6...
## $ TaxiOut (int) 13, 9, 17, 22, 9, 13, 15, 12, 22, 19, 20, 11...
## $ Cancelled (int) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ CancellationCode (chr) "", "", "", "", "", "", "", "", "", "", "", ...
## $ Diverted (int) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ loss (int) -10, -10, 0, 0, -8, -6, 0, -11, 1, 0, 0, -14...
## $ loss_ratio (dbl) -Inf, -10.00000000, 0.00000000, 0.00000000, ...
Additionally, examples for filter and arrange:
# Examples for filter
filter(hflights, Distance >= 3000) # All flights that traveled 3000 miles or more
## Source: local data frame [527 x 21]
##
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## (int) (int) (int) (int) (int) (int) (chr)
## 1 2011 1 31 1 924 1413 CO
## 2 2011 1 30 7 925 1410 CO
## 3 2011 1 29 6 1045 1445 CO
## 4 2011 1 28 5 1516 1916 CO
## 5 2011 1 27 4 950 1344 CO
## 6 2011 1 26 3 944 1350 CO
## 7 2011 1 25 2 924 1337 CO
## 8 2011 1 24 1 1144 1605 CO
## 9 2011 1 23 7 926 1335 CO
## 10 2011 1 22 6 942 1340 CO
## .. ... ... ... ... ... ... ...
## Variables not shown: FlightNum (int), TailNum (chr), ActualElapsedTime
## (int), AirTime (int), ArrDelay (int), DepDelay (int), Origin (chr), Dest
## (chr), Distance (int), TaxiIn (int), TaxiOut (int), Cancelled (int),
## CancellationCode (chr), Diverted (int)
filter(hflights, UniqueCarrier %in% c("B6", "WN", "DL"))
## Source: local data frame [48,679 x 21]
##
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## (int) (int) (int) (int) (int) (int) (chr)
## 1 2011 1 1 6 654 1124 B6
## 2 2011 1 1 6 1639 2110 B6
## 3 2011 1 2 7 703 1113 B6
## 4 2011 1 2 7 1604 2040 B6
## 5 2011 1 3 1 659 1100 B6
## 6 2011 1 3 1 1801 2200 B6
## 7 2011 1 4 2 654 1103 B6
## 8 2011 1 4 2 1608 2034 B6
## 9 2011 1 5 3 700 1103 B6
## 10 2011 1 5 3 1544 1954 B6
## .. ... ... ... ... ... ... ...
## Variables not shown: FlightNum (int), TailNum (chr), ActualElapsedTime
## (int), AirTime (int), ArrDelay (int), DepDelay (int), Origin (chr), Dest
## (chr), Distance (int), TaxiIn (int), TaxiOut (int), Cancelled (int),
## CancellationCode (chr), Diverted (int)
filter(hflights, (TaxiIn + TaxiOut) > AirTime) # Flights where taxiing took longer than flying
## Source: local data frame [1,389 x 21]
##
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## (int) (int) (int) (int) (int) (int) (chr)
## 1 2011 1 24 1 731 904 AA
## 2 2011 1 30 7 1959 2132 AA
## 3 2011 1 24 1 1621 1749 AA
## 4 2011 1 10 1 941 1113 AA
## 5 2011 1 31 1 1301 1356 CO
## 6 2011 1 31 1 2113 2215 CO
## 7 2011 1 31 1 1434 1539 CO
## 8 2011 1 31 1 900 1006 CO
## 9 2011 1 30 7 1304 1408 CO
## 10 2011 1 30 7 2004 2128 CO
## .. ... ... ... ... ... ... ...
## Variables not shown: FlightNum (int), TailNum (chr), ActualElapsedTime
## (int), AirTime (int), ArrDelay (int), DepDelay (int), Origin (chr), Dest
## (chr), Distance (int), TaxiIn (int), TaxiOut (int), Cancelled (int),
## CancellationCode (chr), Diverted (int)
filter(hflights, DepTime < 500 | ArrTime > 2200) # Flights departed before 5am or arrived after 10pm
## Source: local data frame [27,799 x 21]
##
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## (int) (int) (int) (int) (int) (int) (chr)
## 1 2011 1 4 2 2100 2207 AA
## 2 2011 1 14 5 2119 2229 AA
## 3 2011 1 10 1 1934 2235 AA
## 4 2011 1 26 3 1905 2211 AA
## 5 2011 1 30 7 1856 2209 AA
## 6 2011 1 9 7 1938 2228 AS
## 7 2011 1 31 1 1919 2231 CO
## 8 2011 1 31 1 2116 2344 CO
## 9 2011 1 31 1 1850 2211 CO
## 10 2011 1 31 1 2102 2216 CO
## .. ... ... ... ... ... ... ...
## Variables not shown: FlightNum (int), TailNum (chr), ActualElapsedTime
## (int), AirTime (int), ArrDelay (int), DepDelay (int), Origin (chr), Dest
## (chr), Distance (int), TaxiIn (int), TaxiOut (int), Cancelled (int),
## CancellationCode (chr), Diverted (int)
filter(hflights, DepDelay > 0, ArrDelay < 0) # Flights that departed late but arrived ahead of schedule
## Source: local data frame [27,712 x 21]
##
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## (int) (int) (int) (int) (int) (int) (chr)
## 1 2011 1 2 7 1401 1501 AA
## 2 2011 1 5 3 1405 1507 AA
## 3 2011 1 18 2 1408 1508 AA
## 4 2011 1 18 2 721 827 AA
## 5 2011 1 12 3 2015 2113 AA
## 6 2011 1 13 4 2020 2116 AA
## 7 2011 1 26 3 2009 2103 AA
## 8 2011 1 1 6 1631 1736 AA
## 9 2011 1 10 1 1639 1740 AA
## 10 2011 1 12 3 1631 1739 AA
## .. ... ... ... ... ... ... ...
## Variables not shown: FlightNum (int), TailNum (chr), ActualElapsedTime
## (int), AirTime (int), ArrDelay (int), DepDelay (int), Origin (chr), Dest
## (chr), Distance (int), TaxiIn (int), TaxiOut (int), Cancelled (int),
## CancellationCode (chr), Diverted (int)
filter(hflights, Cancelled == 1, DepDelay > 0) # Flights that were cancelled after being delayed
## Source: local data frame [40 x 21]
##
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## (int) (int) (int) (int) (int) (int) (chr)
## 1 2011 1 26 3 1926 NA CO
## 2 2011 1 11 2 1100 NA US
## 3 2011 1 19 3 1811 NA XE
## 4 2011 1 7 5 2028 NA XE
## 5 2011 2 4 5 1638 NA AA
## 6 2011 2 8 2 1057 NA CO
## 7 2011 2 2 3 802 NA XE
## 8 2011 2 9 3 904 NA XE
## 9 2011 2 1 2 1508 NA OO
## 10 2011 3 31 4 1016 NA CO
## .. ... ... ... ... ... ... ...
## Variables not shown: FlightNum (int), TailNum (chr), ActualElapsedTime
## (int), AirTime (int), ArrDelay (int), DepDelay (int), Origin (chr), Dest
## (chr), Distance (int), TaxiIn (int), TaxiOut (int), Cancelled (int),
## CancellationCode (chr), Diverted (int)
c1 <- filter(hflights, Dest == "JFK") # Flights that had JFK as their destination: c1
c2 <- mutate(c1, Date = paste(Year, Month, DayofMonth, sep="-")) # Create a Date column: c2
select(c2, Date, DepTime, ArrTime, TailNum) # Print out a selection of columns of c2
## Source: local data frame [695 x 4]
##
## Date DepTime ArrTime TailNum
## (chr) (int) (int) (chr)
## 1 2011-1-1 654 1124 N324JB
## 2 2011-1-1 1639 2110 N324JB
## 3 2011-1-2 703 1113 N324JB
## 4 2011-1-2 1604 2040 N324JB
## 5 2011-1-3 659 1100 N229JB
## 6 2011-1-3 1801 2200 N206JB
## 7 2011-1-4 654 1103 N267JB
## 8 2011-1-4 1608 2034 N267JB
## 9 2011-1-5 700 1103 N708JB
## 10 2011-1-5 1544 1954 N644JB
## .. ... ... ... ...
dtc <- filter(hflights, Cancelled == 1, !is.na(DepDelay)) # Definition of dtc
# Examples for arrange
arrange(dtc, DepDelay) # Arrange dtc by departure delays
## Source: local data frame [68 x 21]
##
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## (int) (int) (int) (int) (int) (int) (chr)
## 1 2011 7 23 6 605 NA F9
## 2 2011 1 17 1 916 NA XE
## 3 2011 12 1 4 541 NA US
## 4 2011 10 12 3 2022 NA MQ
## 5 2011 7 29 5 1424 NA CO
## 6 2011 9 29 4 1639 NA OO
## 7 2011 2 9 3 555 NA MQ
## 8 2011 5 9 1 715 NA OO
## 9 2011 1 20 4 1413 NA UA
## 10 2011 1 17 1 831 NA WN
## .. ... ... ... ... ... ... ...
## Variables not shown: FlightNum (int), TailNum (chr), ActualElapsedTime
## (int), AirTime (int), ArrDelay (int), DepDelay (int), Origin (chr), Dest
## (chr), Distance (int), TaxiIn (int), TaxiOut (int), Cancelled (int),
## CancellationCode (chr), Diverted (int)
arrange(dtc, CancellationCode) # Arrange dtc so that cancellation reasons are grouped
## Source: local data frame [68 x 21]
##
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## (int) (int) (int) (int) (int) (int) (chr)
## 1 2011 1 20 4 1413 NA UA
## 2 2011 1 7 5 2028 NA XE
## 3 2011 2 4 5 1638 NA AA
## 4 2011 2 8 2 1057 NA CO
## 5 2011 2 1 2 1508 NA OO
## 6 2011 2 21 1 2257 NA OO
## 7 2011 2 9 3 555 NA MQ
## 8 2011 3 18 5 727 NA UA
## 9 2011 4 4 1 1632 NA DL
## 10 2011 4 8 5 1608 NA WN
## .. ... ... ... ... ... ... ...
## Variables not shown: FlightNum (int), TailNum (chr), ActualElapsedTime
## (int), AirTime (int), ArrDelay (int), DepDelay (int), Origin (chr), Dest
## (chr), Distance (int), TaxiIn (int), TaxiOut (int), Cancelled (int),
## CancellationCode (chr), Diverted (int)
arrange(dtc, UniqueCarrier, DepDelay) # Arrange dtc according to carrier and departure delays
## Source: local data frame [68 x 21]
##
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## (int) (int) (int) (int) (int) (int) (chr)
## 1 2011 8 18 4 1808 NA AA
## 2 2011 2 4 5 1638 NA AA
## 3 2011 7 29 5 1424 NA CO
## 4 2011 1 26 3 1703 NA CO
## 5 2011 8 11 4 1320 NA CO
## 6 2011 7 25 1 1654 NA CO
## 7 2011 1 26 3 1926 NA CO
## 8 2011 3 31 4 1016 NA CO
## 9 2011 2 8 2 1057 NA CO
## 10 2011 4 4 1 1632 NA DL
## .. ... ... ... ... ... ... ...
## Variables not shown: FlightNum (int), TailNum (chr), ActualElapsedTime
## (int), AirTime (int), ArrDelay (int), DepDelay (int), Origin (chr), Dest
## (chr), Distance (int), TaxiIn (int), TaxiOut (int), Cancelled (int),
## CancellationCode (chr), Diverted (int)
arrange(hflights, UniqueCarrier, desc(DepDelay)) # Arrange by carrier and decreasing departure delays
## Source: local data frame [227,496 x 21]
##
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## (int) (int) (int) (int) (int) (int) (chr)
## 1 2011 12 12 1 650 808 AA
## 2 2011 11 19 6 1752 1910 AA
## 3 2011 12 22 4 1728 1848 AA
## 4 2011 10 23 7 2305 2 AA
## 5 2011 9 27 2 1206 1300 AA
## 6 2011 3 17 4 1647 1747 AA
## 7 2011 6 21 2 955 1315 AA
## 8 2011 5 20 5 2359 130 AA
## 9 2011 4 19 2 2023 2142 AA
## 10 2011 5 12 4 2133 53 AA
## .. ... ... ... ... ... ... ...
## Variables not shown: FlightNum (int), TailNum (chr), ActualElapsedTime
## (int), AirTime (int), ArrDelay (int), DepDelay (int), Origin (chr), Dest
## (chr), Distance (int), TaxiIn (int), TaxiOut (int), Cancelled (int),
## CancellationCode (chr), Diverted (int)
arrange(hflights, DepDelay + ArrDelay) # Arrange flights by total delay (normal order)
## Source: local data frame [227,496 x 21]
##
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## (int) (int) (int) (int) (int) (int) (chr)
## 1 2011 7 3 7 1914 2039 XE
## 2 2011 8 31 3 934 1039 OO
## 3 2011 8 21 7 935 1039 OO
## 4 2011 8 28 7 2059 2206 OO
## 5 2011 8 29 1 935 1041 OO
## 6 2011 12 25 7 741 926 OO
## 7 2011 1 30 7 620 812 OO
## 8 2011 8 3 3 1741 1810 XE
## 9 2011 8 4 4 930 1041 OO
## 10 2011 8 18 4 939 1043 OO
## .. ... ... ... ... ... ... ...
## Variables not shown: FlightNum (int), TailNum (chr), ActualElapsedTime
## (int), AirTime (int), ArrDelay (int), DepDelay (int), Origin (chr), Dest
## (chr), Distance (int), TaxiIn (int), TaxiOut (int), Cancelled (int),
## CancellationCode (chr), Diverted (int)
Additionally, examples for the summarize verb:
# Print out a summary with variables min_dist and max_dist
summarize(hflights, min_dist = min(Distance), max_dist = max(Distance))
## Source: local data frame [1 x 2]
##
## min_dist max_dist
## (int) (int)
## 1 79 3904
# Print out a summary with variable max_div
summarize(filter(hflights, Diverted == 1), max_div = max(Distance))
## Source: local data frame [1 x 1]
##
## max_div
## (int)
## 1 3904
# Remove rows that have NA ArrDelay: temp1
temp1 <- filter(hflights, !is.na(ArrDelay))
# Generate summary about ArrDelay column of temp1
summarize(temp1, earliest=min(ArrDelay), average=mean(ArrDelay), latest=max(ArrDelay), sd=sd(ArrDelay))
## Source: local data frame [1 x 4]
##
## earliest average latest sd
## (int) (dbl) (int) (dbl)
## 1 -70 7.094334 978 30.70852
# Keep rows that have no NA TaxiIn and no NA TaxiOut: temp2
temp2 <- filter(hflights, !is.na(TaxiIn), !is.na(TaxiOut))
# Print the maximum taxiing difference of temp2 with summarise()
summarize(temp2, max_taxi_diff = max(abs(TaxiIn - TaxiOut)))
## Source: local data frame [1 x 1]
##
## max_taxi_diff
## (int)
## 1 160
# Generate summarizing statistics for hflights
summarize(hflights, n_obs = n(), n_carrier = n_distinct(UniqueCarrier), n_dest = n_distinct(Dest))
## Source: local data frame [1 x 3]
##
## n_obs n_carrier n_dest
## (int) (int) (int)
## 1 227496 15 116
# All American Airline flights
aa <- filter(hflights, UniqueCarrier == "AA")
# Generate summarizing statistics for aa
summarize(aa, n_flights = n(), n_canc = sum(Cancelled), avg_delay = mean(ArrDelay, na.rm=TRUE))
## Source: local data frame [1 x 3]
##
## n_flights n_canc avg_delay
## (int) (int) (dbl)
## 1 3244 60 0.8917558
Additionally, examples for the pipe/chain as per magrittr:
# Find the average delta in taxi times
hflights %>%
mutate(diff = (TaxiOut - TaxiIn)) %>%
filter(!is.na(diff)) %>%
summarize(avg = mean(diff))
## Source: local data frame [1 x 1]
##
## avg
## (dbl)
## 1 8.992064
# Find flights that average less than 70 mph assuming 100 wasted minutes per flight
hflights %>%
mutate(RealTime = ActualElapsedTime + 100, mph = 60 * Distance / RealTime) %>%
filter(!is.na(mph), mph < 70) %>%
summarize(n_less = n(), n_dest = n_distinct(Dest), min_dist = min(Distance), max_dist = max(Distance))
## Source: local data frame [1 x 4]
##
## n_less n_dest min_dist max_dist
## (int) (int) (int) (int)
## 1 6726 13 79 305
# Find flights that average less than 105 mph, or that are diverted/cancelled
hflights %>%
mutate(RealTime = ActualElapsedTime + 100, mph = Distance / RealTime * 60) %>%
filter(mph < 105 | Cancelled == 1 | Diverted == 1) %>%
summarize(n_non = n(), n_dest = n_distinct(Dest), min_dist = min(Distance), max_dist = max(Distance))
## Source: local data frame [1 x 4]
##
## n_non n_dest min_dist max_dist
## (int) (int) (int) (int)
## 1 42400 113 79 3904
# Find overnight flights
filter(hflights, !is.na(DepTime), !is.na(ArrTime), DepTime > ArrTime) %>%
summarize(num = n())
## Source: local data frame [1 x 1]
##
## num
## (int)
## 1 2718
There is also the group_by capability, typically for use with summarize:
# Make an ordered per-carrier summary of hflights
group_by(hflights, UniqueCarrier) %>%
summarize(p_canc = 100 * mean(Cancelled, na.rm=TRUE), avg_delay = mean(ArrDelay, na.rm=TRUE)) %>%
arrange(avg_delay, p_canc)
## Source: local data frame [15 x 3]
##
## UniqueCarrier p_canc avg_delay
## (chr) (dbl) (dbl)
## 1 US 1.1268986 -0.6307692
## 2 AA 1.8495684 0.8917558
## 3 FL 0.9817672 1.8536239
## 4 AS 0.0000000 3.1923077
## 5 YV 1.2658228 4.0128205
## 6 DL 1.5903067 6.0841374
## 7 CO 0.6782614 6.0986983
## 8 MQ 2.9044750 7.1529751
## 9 EV 3.4482759 7.2569543
## 10 WN 1.5504047 7.5871430
## 11 F9 0.7159905 7.6682692
## 12 XE 1.5495599 8.1865242
## 13 OO 1.3946828 8.6934922
## 14 B6 2.5899281 9.8588410
## 15 UA 1.6409266 10.4628628
# Ordered overview of average arrival delays per carrier
hflights %>%
filter(!is.na(ArrDelay), ArrDelay > 0) %>%
group_by(UniqueCarrier) %>%
summarize(avg = mean(ArrDelay)) %>%
mutate(rank = rank(avg)) %>%
arrange(rank)
## Source: local data frame [15 x 3]
##
## UniqueCarrier avg rank
## (chr) (dbl) (dbl)
## 1 YV 18.67568 1
## 2 F9 18.68683 2
## 3 US 20.70235 3
## 4 CO 22.13374 4
## 5 AS 22.91195 5
## 6 OO 24.14663 6
## 7 XE 24.19337 7
## 8 WN 25.27750 8
## 9 FL 27.85693 9
## 10 AA 28.49740 10
## 11 DL 32.12463 11
## 12 UA 32.48067 12
## 13 MQ 38.75135 13
## 14 EV 40.24231 14
## 15 B6 45.47744 15
# How many airplanes only flew to one destination?
hflights %>%
group_by(TailNum) %>%
summarise(destPerTail = n_distinct(Dest)) %>%
filter(destPerTail == 1) %>%
summarise(nplanes=n())
## Source: local data frame [1 x 1]
##
## nplanes
## (int)
## 1 1526
# Find the most visited destination for each carrier
hflights %>%
group_by(UniqueCarrier, Dest) %>%
summarise(n = n()) %>%
mutate(rank = rank(-n)) %>%
filter(rank == 1)
## Source: local data frame [15 x 4]
## Groups: UniqueCarrier [15]
##
## UniqueCarrier Dest n rank
## (chr) (chr) (int) (dbl)
## 1 AA DFW 2105 1
## 2 AS SEA 365 1
## 3 B6 JFK 695 1
## 4 CO EWR 3924 1
## 5 DL ATL 2396 1
## 6 EV DTW 851 1
## 7 F9 DEN 837 1
## 8 FL ATL 2029 1
## 9 MQ DFW 2424 1
## 10 OO COS 1335 1
## 11 UA SFO 643 1
## 12 US CLT 2212 1
## 13 WN DAL 8243 1
## 14 XE CRP 3175 1
## 15 YV CLT 71 1
# Use summarise to calculate n_carrier
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, last
## The following object is masked from 'package:purrr':
##
## transpose
hflights2 <- as.data.table(hflights)
hflights2 %>%
summarize(n_carrier = n_distinct(UniqueCarrier))
## n_carrier
## 1: 15
dplyr can also be used with databases: it writes the SQL query that matches the dplyr request. The results are cached to avoid constantly pinging the server:
# Set up a connection to the remote MySQL database named "dplyr"
# NOTE(review): host and credentials are hard-coded in plain text; presumably
# this is a public teaching database, but confirm before reusing the pattern
my_db <- src_mysql(dbname = "dplyr",
host = "courses.csrrinzqubik.us-east-1.rds.amazonaws.com",
port = 3306,
user = "student",
password = "datacamp")
# Reference the table called "dplyr" within that source as nycflights
nycflights <- tbl(my_db, "dplyr")
# glimpse at nycflights
glimpse(nycflights)
## Observations: 336,776
## Variables: 17
## $ id (int) 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1...
## $ year (int) 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013...
## $ month (int) 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ day (int) 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ dep_time (int) 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 55...
## $ dep_delay (int) 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2,...
## $ arr_time (int) 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 8...
## $ arr_delay (int) 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7,...
## $ carrier (chr) "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6"...
## $ tailnum (chr) "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N...
## $ flight (int) 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301...
## $ origin (chr) "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LG...
## $ dest (chr) "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IA...
## $ air_time (int) 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149...
## $ distance (int) 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 73...
## $ hour (int) 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6...
## $ minute (int) 17, 33, 42, 44, 54, 54, 55, 57, 57, 58, 58, 58, 58, ...
# Ordered, grouped summary of nycflights
nycflights %>%
group_by(carrier) %>%
summarize(n_flights = n(), avg_delay = mean(arr_delay)) %>%
arrange(avg_delay)
## Source: mysql 5.6.23-log [student@courses.csrrinzqubik.us-east-1.rds.amazonaws.com:/dplyr]
## From: <derived table> [?? x 3]
## Arrange: avg_delay
## Warning in .local(conn, statement, ...): Decimal MySQL column 2 imported as
## numeric
## carrier n_flights avg_delay
## (chr) (dbl) (dbl)
## 1 AS 714 -9.8613
## 2 HA 342 -6.9152
## 3 AA 32729 0.3556
## 4 DL 48110 1.6289
## 5 VX 5162 1.7487
## 6 US 20536 2.0565
## 7 UA 58665 3.5045
## 8 9E 18460 6.9135
## 9 B6 54635 9.3565
## 10 WN 12275 9.4675
## .. ... ... ...
The data.table library is designed to simplify and speed up work with large datasets. The language is broadly analogous to SQL, with syntax that includes equivalents for SELECT, WHERE, and GROUP BY. Some general attributes of a data.table object include:
NOTE - every data.table is also a data.frame, so a package that is not aware of data.table will simply treat the object as a data.frame.
General syntax is:
Example table creation:
Some example code includes:
library(data.table)
DT <- data.table(a = c(1, 2), b=LETTERS[1:4])
str(DT)
## Classes 'data.table' and 'data.frame': 4 obs. of 2 variables:
## $ a: num 1 2 1 2
## $ b: chr "A" "B" "C" "D"
## - attr(*, ".internal.selfref")=<externalptr>
DT
## a b
## 1: 1 A
## 2: 2 B
## 3: 1 C
## 4: 2 D
# Print the second to last row of DT using .N
DT[.N-1]
## a b
## 1: 1 C
# Print the column names of DT
names(DT)
## [1] "a" "b"
# Print the number of rows and columns of DT
dim(DT)
## [1] 4 2
# Select row 2 twice and row 3, returning a data.table with three rows where row 2 is a duplicate of row 1.
DT[c(2, 2:3)]
## a b
## 1: 2 B
## 2: 2 B
## 3: 1 C
DT <- data.table(A = 1:5, B = letters[1:5], C = 6:10)
str(DT)
## Classes 'data.table' and 'data.frame': 5 obs. of 3 variables:
## $ A: int 1 2 3 4 5
## $ B: chr "a" "b" "c" "d" ...
## $ C: int 6 7 8 9 10
## - attr(*, ".internal.selfref")=<externalptr>
DT
## A B C
## 1: 1 a 6
## 2: 2 b 7
## 3: 3 c 8
## 4: 4 d 9
## 5: 5 e 10
# Subset rows 1 and 3, and columns B and C
DT[c(1, 3), .(B, C)]
## B C
## 1: a 6
## 2: c 8
# Assign to ans the correct value
ans <- DT[ , .(B, val=A*C)]
ans
## B val
## 1: a 6
## 2: b 14
## 3: c 24
## 4: d 36
## 5: e 50
# Fill in the blanks such that ans2 equals target
target <- data.table(B = c("a", "b", "c", "d", "e", "a", "b", "c", "d", "e"),
val = as.integer(c(6:10, 1:5))
)
ans2 <- DT[, .(B, val = c(C, A))]
identical(target, ans2)
## [1] TRUE
DT <- as.data.table(iris)
str(DT)
## Classes 'data.table' and 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, ".internal.selfref")=<externalptr>
# For each Species, print the mean Sepal.Length
DT[ , mean(Sepal.Length), Species]
## Species V1
## 1: setosa 5.006
## 2: versicolor 5.936
## 3: virginica 6.588
# Print mean Sepal.Length, grouping by first letter of Species
DT[ , mean(Sepal.Length), substr(Species, 1, 1)]
## substr V1
## 1: s 5.006
## 2: v 6.262
str(DT)
## Classes 'data.table' and 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, ".internal.selfref")=<externalptr>
identical(DT, as.data.table(iris))
## [1] TRUE
# Group the specimens by Sepal area (to the nearest 10 cm2) and count how many occur in each group.
DT[, .N, by = 10 * round(Sepal.Length * Sepal.Width / 10)]
## round N
## 1: 20 117
## 2: 10 29
## 3: 30 4
# Now name the output columns `Area` and `Count`
DT[, .(Count=.N), by = .(Area = 10 * round(Sepal.Length * Sepal.Width / 10))]
## Area Count
## 1: 20 117
## 2: 10 29
## 3: 30 4
# Create the data.table DT
set.seed(1L)
DT <- data.table(A = rep(letters[2:1], each = 4L),
B = rep(1:4, each = 2L),
C = sample(8)
)
str(DT)
## Classes 'data.table' and 'data.frame': 8 obs. of 3 variables:
## $ A: chr "b" "b" "b" "b" ...
## $ B: int 1 1 2 2 3 3 4 4
## $ C: int 3 8 4 5 1 7 2 6
## - attr(*, ".internal.selfref")=<externalptr>
DT
## A B C
## 1: b 1 3
## 2: b 1 8
## 3: b 2 4
## 4: b 2 5
## 5: a 3 1
## 6: a 3 7
## 7: a 4 2
## 8: a 4 6
# Create the new data.table, DT2
DT2 <- DT[, .(C = cumsum(C)), by = .(A, B)]
str(DT2)
## Classes 'data.table' and 'data.frame': 8 obs. of 3 variables:
## $ A: chr "b" "b" "b" "b" ...
## $ B: int 1 1 2 2 3 3 4 4
## $ C: int 3 11 4 9 1 8 2 8
## - attr(*, ".internal.selfref")=<externalptr>
DT2
## A B C
## 1: b 1 3
## 2: b 1 11
## 3: b 2 4
## 4: b 2 9
## 5: a 3 1
## 6: a 3 8
## 7: a 4 2
## 8: a 4 8
# Select from DT2 the last two values from C while you group by A
DT2[, .(C = tail(C, 2)), by = A]
## A C
## 1: b 4
## 2: b 9
## 3: a 2
## 4: a 8
The chaining operation in data.table is run as [statement][next statement].
Example code includes:
set.seed(1L)
DT <- data.table(A = rep(letters[2:1], each = 4L),
B = rep(1:4, each = 2L),
C = sample(8))
str(DT)
## Classes 'data.table' and 'data.frame': 8 obs. of 3 variables:
## $ A: chr "b" "b" "b" "b" ...
## $ B: int 1 1 2 2 3 3 4 4
## $ C: int 3 8 4 5 1 7 2 6
## - attr(*, ".internal.selfref")=<externalptr>
DT
## A B C
## 1: b 1 3
## 2: b 1 8
## 3: b 2 4
## 4: b 2 5
## 5: a 3 1
## 6: a 3 7
## 7: a 4 2
## 8: a 4 6
# Perform operation using chaining
DT[ , .(C = cumsum(C)), by = .(A, B)][ , .(C = tail(C, 2)), by=.(A)]
## A C
## 1: b 4
## 2: b 9
## 3: a 2
## 4: a 8
data(iris)
DT <- as.data.table(iris)
str(DT)
## Classes 'data.table' and 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, ".internal.selfref")=<externalptr>
# Perform chained operations on DT
DT[ , .(Sepal.Length = median(Sepal.Length), Sepal.Width = median(Sepal.Width),
Petal.Length = median(Petal.Length), Petal.Width = median(Petal.Width)),
by=.(Species)][order(-Species)]
## Species Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1: virginica 6.5 3.0 5.55 2.0
## 2: versicolor 5.9 2.8 4.35 1.3
## 3: setosa 5.0 3.4 1.50 0.2
# Mean of columns
# DT[ , lapply(.SD, FUN=mean), by=.(x)]
# Median of columns
# DT[ , lapply(.SD, FUN=median), by=.(x)]
# Calculate the sum of the Q columns
# DT[ , lapply(.SD, FUN=sum), , .SDcols=2:4]
# Calculate the sum of columns H1 and H2
# DT[ , lapply(.SD, FUN=sum), , .SDcols=paste0("H", 1:2)]
# Select all but the first row of groups 1 and 2, returning only the grp column and the Q columns
# foo = function(x) { x[-1] }
# DT[ , lapply(.SD, FUN=foo), by=.(grp), .SDcols=paste0("Q", 1:3)]
# Sum of all columns and the number of rows
# DT[, c(lapply(.SD, FUN=sum), .N), by=.(x), .SDcols=names(DT)]
# Cumulative sum of column x and y while grouping by x and z > 8
# DT[, lapply(.SD, FUN=cumsum), by=.(by1=x, by2=(z>8)), .SDcols=c("x", "y")]
# Chaining
# DT[, lapply(.SD, FUN=cumsum), by=.(by1=x, by2=(z>8)), .SDcols=c("x", "y")][ , lapply(.SD, FUN=max), by=.(by1), .SDcols=c("x", "y")]
# The data.table DT
DT <- data.table(A = letters[c(1, 1, 1, 2, 2)], B = 1:5)
str(DT)
## Classes 'data.table' and 'data.frame': 5 obs. of 2 variables:
## $ A: chr "a" "a" "a" "b" ...
## $ B: int 1 2 3 4 5
## - attr(*, ".internal.selfref")=<externalptr>
DT
## A B
## 1: a 1
## 2: a 2
## 3: a 3
## 4: b 4
## 5: b 5
# Add column by reference: Total
DT[ , Total:=sum(B), by=.(A)]
DT
## A B Total
## 1: a 1 6
## 2: a 2 6
## 3: a 3 6
## 4: b 4 9
## 5: b 5 9
# Add 1 to column B (rows 2 and 4 only)
DT[c(2,4) , B:=B+1L, ]
DT
## A B Total
## 1: a 1 6
## 2: a 3 6
## 3: a 3 6
## 4: b 5 9
## 5: b 5 9
# Add a new column Total2
DT[2:4, Total2:=sum(B), by=.(A)]
DT
## A B Total Total2
## 1: a 1 6 NA
## 2: a 3 6 6
## 3: a 3 6 6
## 4: b 5 9 5
## 5: b 5 9 NA
# Remove the Total column
DT[ , Total := NULL, ]
DT
## A B Total2
## 1: a 1 NA
## 2: a 3 6
## 3: a 3 6
## 4: b 5 5
## 5: b 5 NA
# Select the third column using `[[`
DT[[3]]
## [1] NA 6 6 5 NA
# A data.table DT has been created for you
DT <- data.table(A = c(1, 1, 1, 2, 2), B = 1:5)
str(DT)
## Classes 'data.table' and 'data.frame': 5 obs. of 2 variables:
## $ A: num 1 1 1 2 2
## $ B: int 1 2 3 4 5
## - attr(*, ".internal.selfref")=<externalptr>
DT
## A B
## 1: 1 1
## 2: 1 2
## 3: 1 3
## 4: 2 4
## 5: 2 5
# Update B, add C and D
DT[ , c("B", "C", "D") := .(B + 1, A + B, 2), ]
DT
## A B C D
## 1: 1 2 2 2
## 2: 1 3 3 2
## 3: 1 4 4 2
## 4: 2 5 6 2
## 5: 2 6 7 2
# Delete my_cols
my_cols <- c("B", "C")
DT[ , (my_cols) := NULL, ]
DT
## A D
## 1: 1 2
## 2: 1 2
## 3: 1 2
## 4: 2 2
## 5: 2 2
# Delete column 2 by number
DT[[2]] <- NULL
DT
## A
## 1: 1
## 2: 1
## 3: 1
## 4: 2
## 5: 2
# Set the seed
# set.seed(1)
# Check the DT that is made available to you
# DT
# For loop with set
# for(i in 2:4) { set(DT, sample(nrow(DT), 3), i, NA) }
# Change the column names to lowercase
# setnames(DT, letters[1:4])
# Print the resulting DT to the console
# DT
# Define DT
DT <- data.table(a = letters[c(1, 1, 1, 2, 2)], b = 1)
str(DT)
## Classes 'data.table' and 'data.frame': 5 obs. of 2 variables:
## $ a: chr "a" "a" "a" "b" ...
## $ b: num 1 1 1 1 1
## - attr(*, ".internal.selfref")=<externalptr>
DT
## a b
## 1: a 1
## 2: a 1
## 3: a 1
## 4: b 1
## 5: b 1
# Add a suffix "_2" to all column names
setnames(DT, paste0(names(DT), "_2"))
DT
## a_2 b_2
## 1: a 1
## 2: a 1
## 3: a 1
## 4: b 1
## 5: b 1
# Change column name "a_2" to "A2"
setnames(DT, "a_2", "A2")
DT
## A2 b_2
## 1: a 1
## 2: a 1
## 3: a 1
## 4: b 1
## 5: b 1
# Reverse the order of the columns
setcolorder(DT, 2:1)
DT
## b_2 A2
## 1: 1 a
## 2: 1 a
## 3: 1 a
## 4: 1 b
## 5: 1 b
Example code includes:
# iris as a data.table
iris <- as.data.table(iris)
# Remove the "Sepal." prefix by reference. setnames() is used rather than
# names(iris) <- ...: `names<-` copies the data.table, which invalidates its
# internal self-reference and makes a later := warn when it adds a column.
setnames(iris, gsub("Sepal\\.", "", names(iris)))
# Remove the two columns starting with "Petal"
iris[, c("Petal.Length", "Petal.Width") := NULL, ]
# Cleaned up iris data.table
str(iris)
## Classes 'data.table' and 'data.frame': 150 obs. of 3 variables:
## $ Length : num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Species: Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, ".internal.selfref")=<externalptr>
# Area is greater than 20 square centimeters
iris[ Width * Length > 20 ]
## Length Width Species
## 1: 5.4 3.9 setosa
## 2: 5.8 4.0 setosa
## 3: 5.7 4.4 setosa
## 4: 5.4 3.9 setosa
## 5: 5.7 3.8 setosa
## 6: 5.2 4.1 setosa
## 7: 5.5 4.2 setosa
## 8: 7.0 3.2 versicolor
## 9: 6.4 3.2 versicolor
## 10: 6.9 3.1 versicolor
## 11: 6.3 3.3 versicolor
## 12: 6.7 3.1 versicolor
## 13: 6.7 3.0 versicolor
## 14: 6.0 3.4 versicolor
## 15: 6.7 3.1 versicolor
## 16: 6.3 3.3 virginica
## 17: 7.1 3.0 virginica
## 18: 7.6 3.0 virginica
## 19: 7.3 2.9 virginica
## 20: 7.2 3.6 virginica
## 21: 6.5 3.2 virginica
## 22: 6.8 3.0 virginica
## 23: 6.4 3.2 virginica
## 24: 7.7 3.8 virginica
## 25: 7.7 2.6 virginica
## 26: 6.9 3.2 virginica
## 27: 7.7 2.8 virginica
## 28: 6.7 3.3 virginica
## 29: 7.2 3.2 virginica
## 30: 7.2 3.0 virginica
## 31: 7.4 2.8 virginica
## 32: 7.9 3.8 virginica
## 33: 7.7 3.0 virginica
## 34: 6.3 3.4 virginica
## 35: 6.9 3.1 virginica
## 36: 6.7 3.1 virginica
## 37: 6.9 3.1 virginica
## 38: 6.8 3.2 virginica
## 39: 6.7 3.3 virginica
## 40: 6.7 3.0 virginica
## 41: 6.2 3.4 virginica
## Length Width Species
# Add new boolean column
iris[, is_large := Width * Length > 25]
# Now large observations with is_large
iris[is_large == TRUE]
## Length Width Species is_large
## 1: 5.7 4.4 setosa TRUE
## 2: 7.2 3.6 virginica TRUE
## 3: 7.7 3.8 virginica TRUE
## 4: 7.9 3.8 virginica TRUE
iris[(is_large)] # Also OK
## Length Width Species is_large
## 1: 5.7 4.4 setosa TRUE
## 2: 7.2 3.6 virginica TRUE
## 3: 7.7 3.8 virginica TRUE
## 4: 7.9 3.8 virginica TRUE
# The 'keyed' data.table DT
DT <- data.table(A = letters[c(2, 1, 2, 3, 1, 2, 3)],
B = c(5, 4, 1, 9, 8, 8, 6),
C = 6:12)
setkey(DT, A, B)
str(DT)
## Classes 'data.table' and 'data.frame': 7 obs. of 3 variables:
## $ A: chr "a" "a" "b" "b" ...
## $ B: num 4 8 1 5 8 6 9
## $ C: int 7 10 8 6 11 12 9
## - attr(*, ".internal.selfref")=<externalptr>
## - attr(*, "sorted")= chr "A" "B"
DT
## A B C
## 1: a 4 7
## 2: a 8 10
## 3: b 1 8
## 4: b 5 6
## 5: b 8 11
## 6: c 6 12
## 7: c 9 9
# Select the "b" group
DT["b"]
## A B C
## 1: b 1 8
## 2: b 5 6
## 3: b 8 11
# "b" and "c" groups
DT[c("b", "c")]
## A B C
## 1: b 1 8
## 2: b 5 6
## 3: b 8 11
## 4: c 6 12
## 5: c 9 9
# The first row of the "b" and "c" groups
DT[c("b", "c"), mult = "first"]
## A B C
## 1: b 1 8
## 2: c 6 12
# First and last row of the "b" and "c" groups
DT[c("b", "c"), .SD[c(1, .N)], by = .EACHI]
## A B C
## 1: b 1 8
## 2: b 8 11
## 3: c 6 12
## 4: c 9 9
# Copy and extend code for instruction 4: add printout
DT[c("b", "c"), { print(.SD); .SD[c(1, .N)] }, by = .EACHI]
## B C
## 1: 1 8
## 2: 5 6
## 3: 8 11
## B C
## 1: 6 12
## 2: 9 9
## A B C
## 1: b 1 8
## 2: b 8 11
## 3: c 6 12
## 4: c 9 9
# Keyed data.table DT
DT <- data.table(A = letters[c(2, 1, 2, 3, 1, 2, 3)],
B = c(5, 4, 1, 9, 8, 8, 6),
C = 6:12,
key = "A,B")
str(DT)
## Classes 'data.table' and 'data.frame': 7 obs. of 3 variables:
## $ A: chr "a" "a" "b" "b" ...
## $ B: num 4 8 1 5 8 6 9
## $ C: int 7 10 8 6 11 12 9
## - attr(*, "sorted")= chr "A" "B"
## - attr(*, ".internal.selfref")=<externalptr>
DT
## A B C
## 1: a 4 7
## 2: a 8 10
## 3: b 1 8
## 4: b 5 6
## 5: b 8 11
## 6: c 6 12
## 7: c 9 9
# Get the key of DT
key(DT)
## [1] "A" "B"
# Row where A == "b" and B == 6
DT[.("b", 6)]
## A B C
## 1: b 6 NA
# Return the prevailing row
DT[.("b", 6), roll=TRUE]
## A B C
## 1: b 6 6
# Return the nearest row
DT[.("b", 6), roll="nearest"]
## A B C
## 1: b 6 6
# Keyed data.table DT
DT <- data.table(A = letters[c(2, 1, 2, 3, 1, 2, 3)],
B = c(5, 4, 1, 9, 8, 8, 6),
C = 6:12,
key = "A,B")
str(DT)
## Classes 'data.table' and 'data.frame': 7 obs. of 3 variables:
## $ A: chr "a" "a" "b" "b" ...
## $ B: num 4 8 1 5 8 6 9
## $ C: int 7 10 8 6 11 12 9
## - attr(*, "sorted")= chr "A" "B"
## - attr(*, ".internal.selfref")=<externalptr>
DT
## A B C
## 1: a 4 7
## 2: a 8 10
## 3: b 1 8
## 4: b 5 6
## 5: b 8 11
## 6: c 6 12
## 7: c 9 9
# Print the sequence (-2):10 for the "b" group
DT[.("b", (-2):10)]
## A B C
## 1: b -2 NA
## 2: b -1 NA
## 3: b 0 NA
## 4: b 1 8
## 5: b 2 NA
## 6: b 3 NA
## 7: b 4 NA
## 8: b 5 6
## 9: b 6 NA
## 10: b 7 NA
## 11: b 8 11
## 12: b 9 NA
## 13: b 10 NA
# Add code: carry the prevailing values forwards
DT[.("b", (-2):10), roll=TRUE]
## A B C
## 1: b -2 NA
## 2: b -1 NA
## 3: b 0 NA
## 4: b 1 8
## 5: b 2 8
## 6: b 3 8
## 7: b 4 8
## 8: b 5 6
## 9: b 6 6
## 10: b 7 6
## 11: b 8 11
## 12: b 9 11
## 13: b 10 11
# Add code: carry the first observation backwards
DT[.("b", (-2):10), roll=TRUE, rollends=TRUE]
## A B C
## 1: b -2 8
## 2: b -1 8
## 3: b 0 8
## 4: b 1 8
## 5: b 2 8
## 6: b 3 8
## 7: b 4 8
## 8: b 5 6
## 9: b 6 6
## 10: b 7 6
## 11: b 8 11
## 12: b 9 11
## 13: b 10 11
Data visualization is the combination of Statistics and Design:
The Anscombe plot examples show four different datasets explained by the identical linear model. This reinforces the importance of plotting the data prior to running analyses and drawing conclusions.
The “Grammar of Graphics” is a plotting framework based on the book by Leland Wilkinson, “Grammar of Graphics” (1999). The gist is that graphics are made of distinct layers of grammatical elements. Meaningful plots are created through aesthetic mapping.
Essential Grammatical Elements include:
The ggplot2 package was one of the first packages developed and designed by Hadley Wickham. It implements the “Grammar of Graphics” in R, for example with:
The Anscombe data is good to have plotted for reference:
library(ggplot2)
data(anscombe)
ansX <- with(anscombe, c(x1, x2, x3, x4))
ansY <- with(anscombe, c(y1, y2, y3, y4))
ansType <- rep(1:4, each=nrow(anscombe))
ansFrame <- data.frame(x=ansX, y=ansY, series=factor(ansType))
# ggplot example for Anscombe data
ggplot(ansFrame, aes(x=x, y=y)) +
geom_point() +
geom_smooth(method="lm", col="red", se=FALSE, fullrange=TRUE) +
facet_wrap(~ series, nrow=2)
As well, the basic example code from above is useful to explore:
data(iris)
ggplot(iris, aes(x=Sepal.Length, y=Sepal.Width)) +
geom_jitter(alpha = 0.6) +
facet_grid(. ~ Species) +
stat_smooth(method = "lm", se = FALSE, col="red")
Some additional basic ggplot syntax includes (cached, since plotting each point of the diamonds dataset is taxing for the graphics):
# Explore the mtcars data frame with str()
data(mtcars)
str(mtcars)
## 'data.frame': 32 obs. of 11 variables:
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
## $ disp: num 160 160 108 258 360 ...
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec: num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
## $ am : num 1 1 1 0 0 0 0 0 0 0 ...
## $ gear: num 4 4 4 3 3 3 3 4 4 4 ...
## $ carb: num 4 4 1 1 2 1 4 2 2 4 ...
# Execute the following command
ggplot(mtcars, aes(x = cyl, y = mpg)) +
geom_point()
# Change the command below so that cyl is treated as factor
ggplot(mtcars, aes(x = factor(cyl), y = mpg)) +
geom_point()
# A scatter plot has been made for you
ggplot(mtcars, aes(x = wt, y = mpg)) +
geom_point()
# Replace ___ with the correct vector
ggplot(mtcars, aes(x = wt, y = mpg, col = disp)) +
geom_point()
# Replace ___ with the correct vector
ggplot(mtcars, aes(x = wt, y = mpg, size = disp)) +
geom_point()
# Explore the diamonds data frame with str()
data(diamonds)
str(diamonds)
## Classes 'tbl_df', 'tbl' and 'data.frame': 53940 obs. of 10 variables:
## $ carat : num 0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
## $ cut : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
## $ color : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
## $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
## $ depth : num 61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
## $ table : num 55 61 65 58 58 57 57 55 61 61 ...
## $ price : int 326 326 327 334 335 336 336 337 337 338 ...
## $ x : num 3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
## $ y : num 3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
## $ z : num 2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
# Add geom_point() with +
ggplot(diamonds, aes(x = carat, y = price)) + geom_point()
# Add geom_point() and geom_smooth() with +
ggplot(diamonds, aes(x = carat, y = price)) + geom_point() + geom_smooth()
# The plot you created in the previous exercise
ggplot(diamonds, aes(x = carat, y = price)) +
geom_point() +
geom_smooth()
# Copy the above command but show only the smooth line
ggplot(diamonds, aes(x = carat, y = price)) +
geom_smooth()
# Copy the above command and assign the correct value to col in aes()
ggplot(diamonds, aes(x = carat, y = price, col=clarity)) +
geom_smooth()
# Keep the color settings from previous command. Plot only the points with argument alpha.
ggplot(diamonds, aes(x = carat, y = price, col=clarity)) +
geom_point(alpha = 0.4)
# Create the object containing the data and aes layers: dia_plot
dia_plot <- ggplot(diamonds, aes(x = carat, y=price))
# Add a geom layer with + and geom_point()
dia_plot + geom_point()
# Add the same geom layer, but with aes() inside
dia_plot + geom_point(aes(col = clarity))
set.seed(1)
# The dia_plot object has been created for you
dia_plot <- ggplot(diamonds, aes(x = carat, y = price))
# Expand dia_plot by adding geom_point() with alpha set to 0.2
dia_plot <- dia_plot + geom_point(alpha = 0.2)
# Plot dia_plot with additional geom_smooth() with se set to FALSE
dia_plot + geom_smooth(se = FALSE)
# Copy the command from above and add aes() with the correct mapping to geom_smooth()
dia_plot + geom_smooth(se = FALSE, aes(col = clarity))
Data Layer - How data structure influences plots (ggplot2 vs. base):
Can add some additional points (similar to points() in base, but with axes rescaling for you):
Example code includes:
data(mtcars)
str(mtcars)
## 'data.frame': 32 obs. of 11 variables:
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
## $ disp: num 160 160 108 258 360 ...
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec: num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
## $ am : num 1 1 1 0 0 0 0 0 0 0 ...
## $ gear: num 4 4 4 3 3 3 3 4 4 4 ...
## $ carb: num 4 4 1 1 2 1 4 2 2 4 ...
# Plot the correct variables of mtcars
plot(mtcars$wt, mtcars$mpg, col=mtcars$cyl)
# Change cyl inside mtcars to a factor
mtcars$cyl <- as.factor(mtcars$cyl)
# Make the same plot as in the first instruction
plot(mtcars$wt, mtcars$mpg, col=mtcars$cyl)
# Use lm() to calculate a linear model and save it as carModel
carModel <- lm(mpg ~ wt, data = mtcars)
# Call abline() with carModel as first argument and set lty to 2
abline(carModel, lty=2)
# Plot each subset efficiently with lapply
# Draw one fitted regression line (mpg ~ wt) per cylinder group on the open base plot.
# Iterate over the distinct factor levels rather than over every row of mtcars, so
# each model is fitted and drawn exactly once. The position of the level is used as
# the colour index, which matches the points (col = mtcars$cyl, i.e. the factor
# codes) and the legend (col = 1:3). invisible() suppresses the list of NULLs that
# abline() would otherwise print.
invisible(lapply(seq_along(levels(mtcars$cyl)), function(i) {
  cyl_level <- levels(mtcars$cyl)[i]
  abline(lm(mpg ~ wt, mtcars, subset = (cyl == cyl_level)), col = i)
}))
# This code will draw the legend of the plot
# You don't have to edit this code
legend(x = 5, y = 33, legend = levels(mtcars$cyl),
col = 1:3, pch = 1, bty = "n")
# Plot 1: add geom_point() to this command to create a scatter plot
ggplot(mtcars, aes(x = wt, y = mpg, col = cyl)) +
geom_point() # Fill in using instructions Plot 1
# Plot 2: include the lines of the linear models, per cyl
ggplot(mtcars, aes(x = wt, y = mpg, col = cyl)) +
geom_point() + # Copy from Plot 1
geom_smooth(method="lm", se=FALSE) # Fill in using instructions Plot 2
# Plot 3: include a lm for the entire dataset in its whole
ggplot(mtcars, aes(x = wt, y = mpg, col = cyl)) +
geom_point() + # Copy from Plot 2
geom_smooth(method="lm", se=FALSE) + # Copy from Plot 2
geom_smooth(aes(group = 1), method="lm", se=FALSE, linetype = 2) # Fill in using instructions Plot 3
data(iris)
str(iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
# Option 1
ggplot(iris, aes(x = Sepal.Length, y = Sepal.Width)) +
geom_point() +
geom_point(aes(x = Petal.Length, y = Petal.Width), col = "red")
# DS code to match up to lecturer data set formats
# Builds iris.wide (one row per flower part, columns Length/Width) and iris.tidy
# (one row per single measurement) from the built-in iris data.
data(iris)
# Stack every column except Species into a key column (Type) and a value column (Measure)
longIris <- tidyr::gather(iris, Type, Measure, -Species)
# Split keys such as "Sepal.Length" into Part ("Sepal") and Metric ("Length")
intIris <- tidyr::separate(longIris, Type, c("Part", "Metric"))
# Row id used to pair the Length and Width rows of the same part when spreading.
# Hard-coded for the 150-row iris data and the stacking order
# Sepal.Length, Sepal.Width, Petal.Length, Petal.Width (150 rows each).
intIris$rowNum <- c(1:150, 1:150, 151:300, 151:300)
# One row per (Species, Part, rowNum) with separate Length and Width columns
iris.wide <- tidyr::spread(intIris, Metric, Measure)
# Rename to the lecturer's column names: the numeric value becomes Value and the
# Length/Width label becomes Measure; rowNum and Metric are dropped by select()
iris.tidy <- dplyr::select(dplyr::mutate(intIris, Value=Measure, Measure=Metric), Species, Part, Measure, Value)
# Option 2
ggplot(iris.wide, aes(x = Length, y = Width, col = Part)) +
geom_point()
# Consider the structure of iris, iris.wide and iris.tidy (in that order)
str(iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
str(iris.wide)
## 'data.frame': 300 obs. of 5 variables:
## $ Species: Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Part : chr "Petal" "Petal" "Petal" "Petal" ...
## $ rowNum : int 151 152 153 154 155 156 157 158 159 160 ...
## $ Length : num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
str(iris.tidy)
## 'data.frame': 600 obs. of 4 variables:
## $ Species: Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Part : chr "Sepal" "Sepal" "Sepal" "Sepal" ...
## $ Measure: chr "Length" "Length" "Length" "Length" ...
## $ Value : num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
# Think about which dataset you would use to get the plot shown right
# Fill in the ___ to produce the plot given to the right
ggplot(iris.tidy, aes(x = Species, y = Value, col = Part)) +
geom_jitter() +
facet_grid(. ~ Measure)
# Load the tidyr package
library(tidyr)
# Fill in the ___ to produce to the correct iris.tidy dataset
iris.tidy <- iris %>%
gather(key, Value, -Species) %>%
separate(key, c("Part", "Measure"), "\\.")
# Consider the head of iris, iris.wide and iris.tidy (in that order)
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
head(iris.wide)
## Species Part rowNum Length Width
## 1 setosa Petal 151 1.4 0.2
## 2 setosa Petal 152 1.4 0.2
## 3 setosa Petal 153 1.3 0.2
## 4 setosa Petal 154 1.5 0.2
## 5 setosa Petal 155 1.4 0.2
## 6 setosa Petal 156 1.7 0.4
head(iris.tidy)
## Species Part Measure Value
## 1 setosa Sepal Length 5.1
## 2 setosa Sepal Length 4.9
## 3 setosa Sepal Length 4.7
## 4 setosa Sepal Length 4.6
## 5 setosa Sepal Length 5.0
## 6 setosa Sepal Length 5.4
# Think about which dataset you would use to get the plot shown right
# Fill in the ___ to produce the plot given to the right
ggplot(iris.wide, aes(x = Length, y = Width, col = Part)) +
geom_jitter() +
facet_grid(. ~ Species)
# Add column with unique ids (don't need to change)
# seq_len() is used instead of 1:nrow() so the ids stay correct (empty) even for a
# zero-row data frame, where 1:0 would yield c(1, 0).
iris$Flower <- seq_len(nrow(iris))
# Fill in the ___ to produce to the correct iris.wide dataset
iris.wide <- iris %>%
gather(key, value, -Species, -Flower) %>%
separate(key, c("Part", "Measure"), "\\.") %>%
spread(Measure, value)
Visible aesthetics are the cornerstone of the ggplot:
Modifying aesthetics:
Best practices for choosing among the aesthetics (though note that “there is a fair bit of creativity involved”):
Example code includes:
data(mtcars)
mtcars$cyl <- as.factor(mtcars$cyl)
mtcars$am <- as.factor(mtcars$am)
# Map cyl to y
ggplot(mtcars, aes(x=mpg, y=cyl)) + geom_point()
# Map cyl to x
ggplot(mtcars, aes(y=mpg, x=cyl)) + geom_point()
# Map cyl to col
ggplot(mtcars, aes(y=mpg, x=wt, col=cyl)) + geom_point()
# Change shape and size of the points in the above plot
ggplot(mtcars, aes(y=mpg, x=wt, col=cyl)) + geom_point(shape=1, size=4)
# Map cyl to fill
ggplot(mtcars, aes(x = wt, y = mpg, fill = cyl)) +
geom_point()
# Change shape, size and alpha of the points in the above plot
ggplot(mtcars, aes(x = wt, y = mpg, fill = cyl)) +
geom_point(shape=16, size=6, alpha=0.6)
# Map cyl to size
ggplot(mtcars, aes(y=mpg, x=wt, size=cyl)) + geom_point()
# Map cyl to alpha
ggplot(mtcars, aes(y=mpg, x=wt, alpha=cyl)) + geom_point()
# Map cyl to shape
ggplot(mtcars, aes(y=mpg, x=wt, shape=cyl)) + geom_point()
# Map cyl to labels
ggplot(mtcars, aes(y=mpg, x=wt, label=cyl)) + geom_text()
# Define a hexadecimal color
my_color <- "#123456"
# Set the color aesthetic
ggplot(mtcars, aes(x=wt, y=mpg, col=cyl)) + geom_point()
# Set the color aesthetic and attribute
ggplot(mtcars, aes(x=wt, y=mpg, col=cyl)) + geom_point(col = my_color)
# Set the fill aesthetic and color, size and shape attributes
ggplot(mtcars, aes(x=wt, y=mpg, fill=cyl)) + geom_point(size=10, shape=23, col=my_color)
# Expand to draw points with alpha 0.5
ggplot(mtcars, aes(x = wt, y = mpg, fill = cyl)) + geom_point(alpha=0.5)
# Expand to draw points with shape 24 and color yellow
ggplot(mtcars, aes(x = wt, y = mpg, fill = cyl)) + geom_point(shape=24, col="yellow")
# Expand to draw text with label x, color red and size 10
ggplot(mtcars, aes(x = wt, y = mpg, fill = cyl)) + geom_text(label="x", col="red", size=10)
# Map mpg onto x, qsec onto y and factor(cyl) onto col
ggplot(mtcars, aes(x=mpg, y=qsec, col=factor(cyl))) + geom_point()
# Add mapping: factor(am) onto shape
ggplot(mtcars, aes(x=mpg, y=qsec, col=factor(cyl), shape=factor(am))) + geom_point()
# Add mapping: (hp/wt) onto size
ggplot(mtcars, aes(x=mpg, y=qsec, col=factor(cyl), shape=factor(am), size=(hp/wt))) + geom_point()
# Basic scatter plot: wt on x-axis and mpg on y-axis; map cyl to col
ggplot(mtcars, aes(x=wt, y=mpg, col=cyl)) + geom_point(size=4)
# Hollow circles - an improvement
ggplot(mtcars, aes(x=wt, y=mpg, col=cyl)) + geom_point(size=4, shape=1)
# Add transparency - very nice
ggplot(mtcars, aes(x=wt, y=mpg, col=cyl)) + geom_point(size=4, alpha=0.6)
Next, bar plots are examined using the same data:
cyl.am <- ggplot(mtcars, aes(x = factor(cyl), fill = factor(am)))
# Add geom (position = "stack" by default)
cyl.am + geom_bar()
# Fill - show proportion
cyl.am +
geom_bar(position = "fill")
# Dodging - principles of similarity and proximity
cyl.am +
geom_bar(position = "dodge")
# Clean up the axes with scale_ functions
val = c("#E41A1C", "#377EB8")
lab = c("Manual", "Automatic")
cyl.am +
geom_bar(position = "dodge") +
scale_x_discrete("Cylinders") +
scale_y_continuous("Number") +
scale_fill_manual("Transmission",
values = val,
labels = lab)
# Add a new column called group
mtcars$group <- 0
# Create jittered plot of mtcars: mpg onto x, group onto y
ggplot(mtcars, aes(x = mpg, y=group)) + geom_jitter()
# Change the y aesthetic limits
ggplot(mtcars, aes(x = mpg, y=group)) + geom_jitter() + scale_y_continuous(limits = c(-2, 2))
Further, the diamonds data set is explored to show techniques for minimizing over-plotting problems. Per previous, it is cached due to the lengthy plot times driven by the many data points:
data(diamonds)
str(diamonds)
## Classes 'tbl_df', 'tbl' and 'data.frame': 53940 obs. of 10 variables:
## $ carat : num 0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
## $ cut : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
## $ color : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
## $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
## $ depth : num 61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
## $ table : num 55 61 65 58 58 57 57 55 61 61 ...
## $ price : int 326 326 327 334 335 336 336 337 337 338 ...
## $ x : num 3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
## $ y : num 3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
## $ z : num 2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
# Scatter plot: carat (x), price (y), clarity (col)
ggplot(diamonds, aes(x=carat, y=price, col=clarity)) + geom_point()
# Adjust for overplotting
ggplot(diamonds, aes(x=carat, y=price, col=clarity)) + geom_point(alpha = 0.5)
# Scatter plot: clarity (x), carat (y), price (col)
ggplot(diamonds, aes(y=carat, x=clarity, col=price)) + geom_point(alpha = 0.5)
# Dot plot with jittering
ggplot(diamonds, aes(y=carat, x=clarity, col=price)) + geom_point(alpha = 0.5, position="jitter")
The geometries layer includes the most common plot types:
Scatter plots examples - geom_point(), geom_jitter(), geom_abline():
Bar plots examples - histogram, bar, errorbar:
Line plots examples - line:
Example code from mtcars includes:
# mtcars point plots
# Plot the cyl on the x-axis and wt on the y-axis
ggplot(mtcars, aes(x=cyl, y=wt)) + geom_point()
# Use geom_jitter() instead of geom_point()
ggplot(mtcars, aes(x=cyl, y=wt)) + geom_jitter()
# Define the position object using position_jitter(): posn.j
posn.j <- position_jitter(width = 0.1)
# Use posn.j in geom_point()
ggplot(mtcars, aes(x=cyl, y=wt)) + geom_point(position = posn.j)
# mtcars bar plots
# Make a univariate histogram
ggplot(mtcars, aes(x=mpg)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Change the bin width to 1
ggplot(mtcars, aes(x=mpg)) + geom_histogram(binwidth = 1)
# Change the y aesthetic to density
ggplot(mtcars, aes(x=mpg)) + geom_histogram(binwidth = 1, aes(y=..density..))
# Custom color code
myBlue <- "#377EB8"
# Change the fill color to myBlue
ggplot(mtcars, aes(x=mpg)) + geom_histogram(binwidth = 1, aes(y=..density..), fill=myBlue)
# Draw a bar plot of cyl, filled according to am
ggplot(mtcars, aes(x=cyl, fill=am)) + geom_bar()
# Change the position argument to stack
ggplot(mtcars, aes(x=cyl, fill=am)) + geom_bar(position="stack")
# Change the position argument to fill
ggplot(mtcars, aes(x=cyl, fill=am)) + geom_bar(position="fill")
# Change the position argument to dodge
ggplot(mtcars, aes(x=cyl, fill=am)) + geom_bar(position="dodge")
# Draw a bar plot of cyl, filled according to am
ggplot(mtcars, aes(x=cyl, fill=am)) + geom_bar()
# Change the position argument to "dodge"
ggplot(mtcars, aes(x=cyl, fill=am)) + geom_bar(position = "dodge")
# Define posn_d with position_dodge()
posn_d <- position_dodge(width=0.2)
# Change the position argument to posn_d
ggplot(mtcars, aes(x=cyl, fill=am)) + geom_bar(position = posn_d)
# Use posn_d as position and adjust alpha to 0.6
ggplot(mtcars, aes(x=cyl, fill=am)) + geom_bar(position = posn_d, alpha=0.6)
# A basic histogram, add coloring defined by cyl
ggplot(mtcars, aes(x=mpg, fill=cyl)) +
geom_histogram(binwidth = 1)
# Change position to identity
ggplot(mtcars, aes(x=mpg, fill=cyl)) +
geom_histogram(binwidth = 1, position="identity")
# Change geom to freqpoly (position is identity by default)
ggplot(mtcars, aes(x=mpg, col=cyl)) +
geom_freqpoly(binwidth = 1, position="identity")
# Example of how to use a brewed color palette
ggplot(mtcars, aes(x = cyl, fill = am)) +
geom_bar() +
scale_fill_brewer(palette = "Set1")
# Basic histogram plot command
ggplot(mtcars, aes(mpg)) +
geom_histogram(binwidth = 1)
# Expand the histogram to fill using am
ggplot(mtcars, aes(x=mpg, fill=am)) +
geom_histogram(binwidth = 1)
# Change the position argument to "dodge"
ggplot(mtcars, aes(x=mpg, fill=am)) +
geom_histogram(binwidth = 1, position="dodge")
# Change the position argument to "fill"
ggplot(mtcars, aes(x=mpg, fill=am)) +
geom_histogram(binwidth = 1, position="fill")
# Change the position argument to "identity" and set alpha to 0.4
ggplot(mtcars, aes(x=mpg, fill=am)) +
geom_histogram(binwidth = 1, position="identity", alpha = 0.4)
# Change fill to cyl
ggplot(mtcars, aes(x=mpg, fill=cyl)) +
geom_histogram(binwidth = 1, position="identity", alpha = 0.4)
Next, a few examples are run from dataset car::Vocab (cached due to plotting size/time):
Vocab <- car::Vocab
str(Vocab)
## 'data.frame': 21638 obs. of 4 variables:
## $ year : int 2004 2004 2004 2004 2004 2004 2004 2004 2004 2004 ...
## $ sex : Factor w/ 2 levels "Female","Male": 1 1 2 1 2 2 1 2 2 1 ...
## $ education : int 9 14 14 17 14 14 12 10 11 9 ...
## $ vocabulary: int 3 6 9 8 1 7 6 6 5 1 ...
# Basic scatter plot of vocabulary (y) against education (x). Use geom_point()
ggplot(Vocab, aes(x=education, y=vocabulary)) + geom_point()
# Use geom_jitter() instead of geom_point()
ggplot(Vocab, aes(x=education, y=vocabulary)) + geom_jitter()
# Using the above plotting command, set alpha to a very low 0.2
ggplot(Vocab, aes(x=education, y=vocabulary)) + geom_jitter(alpha = 0.2)
# Using the above plotting command, set the shape to 1
ggplot(Vocab, aes(x=education, y=vocabulary)) + geom_jitter(alpha = 0.2, shape=1)
# Plot education on x and vocabulary on fill
# vocabulary is stored as an integer in car::Vocab, so it is wrapped in factor():
# scale_fill_brewer() is a discrete scale, and geom_bar(position = "fill") needs one
# group per vocabulary score to show the proportions within each bar.
# Use the default brewed color palette
ggplot(Vocab, aes(x = education, fill = factor(vocabulary))) +
  geom_bar(position = "fill") +
  scale_fill_brewer()
# Definition of a set of blue colors
# (namespace-qualified so the call works even when RColorBrewer is not attached)
blues <- RColorBrewer::brewer.pal(9, "Blues")
# Make a color range using colorRampPalette() and the set of blues
blue_range <- colorRampPalette(blues)
# Use blue_range to adjust the color of the bars, use scale_fill_manual()
# blue_range(11) interpolates 11 colours, one per level of factor(vocabulary)
ggplot(Vocab, aes(x = education, fill = factor(vocabulary))) +
  geom_bar(position = "fill") +
  scale_fill_manual(values = blue_range(11))
Lastly, a few additional plots are displayed:
# Print out head of economics
data(economics)
head(economics)
## Source: local data frame [6 x 6]
##
## date pce pop psavert uempmed unemploy
## (date) (dbl) (int) (dbl) (dbl) (int)
## 1 1967-07-01 507.4 198712 12.5 4.5 2944
## 2 1967-08-01 510.5 198911 12.5 4.7 2945
## 3 1967-09-01 516.3 199113 11.7 4.6 2958
## 4 1967-10-01 512.9 199311 12.5 4.9 3143
## 5 1967-11-01 518.1 199498 12.5 4.7 3066
## 6 1967-12-01 525.8 199657 12.1 4.8 3018
# Plot unemploy as a function of date using a line plot
ggplot(economics, aes(x = date, y = unemploy)) + geom_line()
# Adjust plot to represent the fraction of total population that is unemployed
ggplot(economics, aes(x = date, y = unemploy/pop)) + geom_line()
# Date intervals shaded on the unemployment plot below (presumably recession
# periods -- confirm). The numbers are day offsets from the origin 1970-01-01;
# both columns are converted to Date in one pass.
recess <- data.frame(
  lapply(
    list(
      begin = c(-31, 1400, 3652, 4199, 7486, 11382),
      end = c(304, 1885, 3834, 4687, 7729, 11627)
    ),
    as.Date,
    origin = "1970-01-01"
  )
)
ggplot(economics, aes(x = date, y = unemploy/pop)) +
geom_line() +
geom_rect(data=recess, inherit.aes=FALSE, aes(xmin=begin, xmax=end, ymin=-Inf, ymax=+Inf), fill="red", alpha=0.2)
# Cannot find dataset . . .
# Check the structure as a starting point
# str(fish.species)
# Use gather to go from fish.species to fish.tidy
# fish.tidy <- gather(fish.species, Species, Capture, -Year)
# Recreate the plot shown on the right
# ggplot(fish.tidy, aes(x = Year, y = Capture, col=Species)) + geom_line()
The qplot functionality is for making quick and dirty plots:
Basically, the qplot() is nice for just a quick and dirty analysis, though it will have much less flexibility on a go-forward basis.
Example code for qplot includes:
# The old way (shown)
plot(mpg ~ wt, data = mtcars)
# Using ggplot:
ggplot(mtcars, aes(x=wt, y=mpg)) + geom_point()
# Using qplot:
qplot(wt, mpg, data=mtcars)
# Categorical:
# cyl
qplot(wt, mpg, data=mtcars, size=cyl)
# gear
qplot(wt, mpg, data=mtcars, size=gear)
# Continuous
# hp
qplot(wt, mpg, data=mtcars, col=hp)
# qsec
qplot(wt, mpg, data=mtcars, col=qsec)
# qplot() with x only
qplot(factor(cyl), data=mtcars)
# qplot() with x and y
qplot(factor(cyl), factor(vs), data=mtcars)
# qplot() with geom set to jitter manually
qplot(factor(cyl), factor(vs), data=mtcars, geom="jitter")
# Make a dot plot with ggplot
ggplot(mtcars, aes(x=cyl, y=wt, fill = factor(am))) +
geom_dotplot(stackdir="center", binaxis="y")
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.
# qplot with geom "dotplot", binaxis = "y" and stackdir = "center"
qplot(cyl, wt, fill=factor(am), data=mtcars, geom="dotplot", binaxis="y", stackdir="center")
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.
Course #1 wrap-up comments:
A few wrap-up coding exercises for ggplot #1 include:
# Check out the head of ChickWeight
data(ChickWeight)
head(ChickWeight)
## weight Time Chick Diet
## 1 42 0 1 1
## 2 51 2 1 1
## 3 59 4 1 1
## 4 64 6 1 1
## 5 76 8 1 1
## 6 93 10 1 1
# Use ggplot() for the second instruction
ggplot(ChickWeight, aes(x=Time, y=weight)) + geom_line(aes(group=Chick))
# Use ggplot() for the third instruction
ggplot(ChickWeight, aes(x=Time, y=weight, col=Diet)) + geom_line(aes(group=Chick))
# Use ggplot() for the last instruction
ggplot(ChickWeight, aes(x=Time, y=weight, col=Diet)) + geom_line(aes(group=Chick), alpha=0.3) + geom_smooth(lwd=2, se=FALSE)
# Check out the structure of titanic
library(titanic)
## Warning: package 'titanic' was built under R version 3.2.5
library(dplyr)
titanicFull <- titanic::titanic_train
str(titanicFull)
## 'data.frame': 891 obs. of 12 variables:
## $ PassengerId: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Survived : int 0 1 1 1 0 0 0 0 1 1 ...
## $ Pclass : int 3 1 3 1 3 3 1 3 3 2 ...
## $ Name : chr "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
## $ Sex : chr "male" "female" "female" "female" ...
## $ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
## $ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
## $ Ticket : chr "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : chr "" "C85" "" "C123" ...
## $ Embarked : chr "S" "C" "S" "S" ...
titanic <- titanicFull %>%
select(Pclass, Sex, Survived, Age) %>%
filter(complete.cases(.))
str(titanic)
## 'data.frame': 714 obs. of 4 variables:
## $ Pclass : int 3 1 3 1 3 1 3 3 2 3 ...
## $ Sex : chr "male" "female" "female" "female" ...
## $ Survived: int 0 1 1 1 0 0 0 1 1 1 ...
## $ Age : num 22 38 26 35 35 54 2 27 14 4 ...
# Use ggplot() for the first instruction
ggplot(titanic, aes(x=factor(Pclass), fill=factor(Sex))) +
geom_bar(position = "dodge")
# Use ggplot() for the second instruction
ggplot(titanic, aes(x=factor(Pclass), fill=factor(Sex))) +
geom_bar(position = "dodge") +
facet_grid(. ~ Survived)
# Position jitter (use below)
posn.j <- position_jitter(0.5, 0)
# Use ggplot() for the last instruction
ggplot(titanic, aes(x=factor(Pclass), y=Age, col=factor(Sex))) +
geom_jitter(size=3, alpha=0.5, position=posn.j) +
facet_grid(. ~ Survived)
The second course expands on the remaining layers of ggplot2: Statistics, Coordinates, Facets, and Themes.
The statistics layer for ggplot2 has two basic components:
The statistics can also be called independently (outside the geom):
Example code from mtcars includes:
# Explore the mtcars data frame with str()
data(mtcars)
str(mtcars)
## 'data.frame': 32 obs. of 11 variables:
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
## $ disp: num 160 160 108 258 360 ...
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec: num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
## $ am : num 1 1 1 0 0 0 0 0 0 0 ...
## $ gear: num 4 4 4 3 3 3 3 4 4 4 ...
## $ carb: num 4 4 1 1 2 1 4 2 2 4 ...
# A scatter plot with LOESS smooth:
ggplot(mtcars, aes(x = wt, y = mpg)) +
geom_point() +
geom_smooth()
# A scatter plot with an ordinary Least Squares linear model:
ggplot(mtcars, aes(x = wt, y = mpg)) +
geom_point() +
geom_smooth(method = "lm")
# The previous plot, without CI ribbon:
ggplot(mtcars, aes(x = wt, y = mpg)) +
geom_point() +
geom_smooth(method = "lm", se=FALSE)
# The previous plot, without points:
ggplot(mtcars, aes(x = wt, y = mpg)) +
geom_smooth(method = "lm", se=FALSE)
# Define cyl as a factor variable
ggplot(mtcars, aes(x = wt, y = mpg, col = factor(cyl))) +
geom_point() +
stat_smooth(method = "lm", se = FALSE)
# Complete the following ggplot command as instructed
ggplot(mtcars, aes(x = wt, y = mpg, col = factor(cyl))) +
geom_point() +
stat_smooth(method = "lm", se = FALSE) +
stat_smooth(method = "lm", se = FALSE, aes(group=1))
# Plot 1: change the LOESS span
ggplot(mtcars, aes(x = wt, y = mpg)) +
geom_point() +
# Add span below
geom_smooth(se = FALSE, span=0.7)
# Plot 2: Set the overall model to LOESS and use a span of 0.7
ggplot(mtcars, aes(x = wt, y = mpg, col = factor(cyl))) +
geom_point() +
stat_smooth(method = "lm", se = FALSE) +
# Change method and add span below
stat_smooth(method = "loess", aes(group = 1),
se = FALSE, col = "black", span=0.7)
# Plot 3: Set col to "All", inside the aes layer of stat_smooth()
ggplot(mtcars, aes(x = wt, y = mpg, col = factor(cyl))) +
geom_point() +
stat_smooth(method = "lm", se = FALSE) +
stat_smooth(method = "loess",
# Add col inside aes()
aes(group = 1, col="All"),
# Remove the col argument below
se = FALSE, span = 0.7)
# Plot 4: Add scale_color_manual to change the colors
# Three "Dark2" colours for the cyl levels plus black for the pooled "All" line.
# brewer.pal is namespace-qualified so it works without attaching RColorBrewer.
myColors <- c(RColorBrewer::brewer.pal(3, "Dark2"), "black")
ggplot(mtcars, aes(x = wt, y = mpg, col = factor(cyl))) +
  geom_point() +
  # Per-cylinder linear fits (span only affects loess smoothers, so it is not set)
  stat_smooth(method = "lm", se = FALSE) +
  # One LOESS fit across all cars: group = 1 overrides the per-colour grouping and
  # the constant col = "All" adds a fourth entry to the colour legend
  stat_smooth(method = "loess",
              aes(group = 1, col = "All"),
              se = FALSE, span = 0.7) +
  # Add correct arguments to scale_color_manual
  scale_color_manual("Cylinders", values = myColors)
# Display structure of mtcars
str(mtcars)
## 'data.frame': 32 obs. of 11 variables:
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
## $ disp: num 160 160 108 258 360 ...
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec: num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
## $ am : num 1 1 1 0 0 0 0 0 0 0 ...
## $ gear: num 4 4 4 3 3 3 3 4 4 4 ...
## $ carb: num 4 4 1 1 2 1 4 2 2 4 ...
# Convert cyl and am to factors:
mtcars$cyl <- as.factor(mtcars$cyl)
mtcars$am <- as.factor(mtcars$am)
# Define positions:
posn.d <- position_dodge(width = 0.1)
posn.jd <- position_jitterdodge(jitter.width = 0.1, dodge.width = 0.2)
posn.j <- position_jitter(width = 0.2)
# base layers:
wt.cyl.am <- ggplot(mtcars, aes(x=cyl, y=wt, col=am, fill=am, group=am))
# Plot 1: Jittered, dodged scatter plot with transparent points
wt.cyl.am +
geom_point(position = posn.jd, alpha = 0.6)
# Plot 2: Mean and SD - the easy way
wt.cyl.am +
geom_point(position = posn.jd, alpha = 0.6) +
stat_summary(fun.data=mean_sdl, fun.args=list(mult=1), position=posn.d)
# Plot 3: Mean and 95% CI - the easy way
wt.cyl.am +
geom_point(position = posn.jd, alpha = 0.6) +
stat_summary(fun.data=mean_cl_normal, position=posn.d)
# Plot 4: Mean and SD - with T-tipped error bars - fill in ___
wt.cyl.am +
stat_summary(geom = "point", fun.y = mean,
position = posn.d) +
stat_summary(geom = "errorbar", fun.data = mean_sdl,
position = posn.d, fun.args = list(mult = 1), width = 0.1)
xx <- 1:100
# Summary helper for ggplot: returns the range of x as a one-row data frame
# with columns ymin (smallest value) and ymax (largest value).
gg_range <- function(x) {
  lowest <- min(x)
  highest <- max(x)
  data.frame(ymin = lowest, ymax = highest)
}
gg_range(xx)
## ymin ymax
## 1 1 100
# Required output:
# ymin ymax
# 1 1 100
# Custom summary helper for ggplot: returns a one-row data frame holding the
# median of x (y), its first quartile (ymin) and its third quartile (ymax).
med_IQR <- function(x) {
  centre <- median(x)
  lower <- quantile(x, 0.25)
  upper <- quantile(x, 0.75)
  data.frame(y = centre, ymin = lower, ymax = upper)
}
# Check med_IQR on xx. The "25%" row label in the output is the name that
# quantile() attaches to its result, carried over as the row name.
med_IQR(xx)
## y ymin ymax
## 25% 50.5 25.75 75.25
# Required output:
# y ymin ymax
# 25% 50.5 25.75 75.25
# Rebuild the base layer: weight by cylinder, split by transmission
wt.cyl.am <- ggplot(
  data = mtcars,
  mapping = aes(x = cyl, y = wt, col = am, fill = am, group = am)
)
# Three stat_summary layers per group: a line range from med_IQR, a
# semi-transparent line range from gg_range, and a black "X" at the median.
# NOTE(review): fun.y is reported as deprecated in newer ggplot2 releases in
# favour of fun - confirm against the installed version before changing it.
wt.cyl.am +
  stat_summary(
    fun.data = med_IQR,
    geom = "linerange",
    size = 3,
    position = posn.d
  ) +
  stat_summary(
    fun.data = gg_range,
    geom = "linerange",
    size = 3,
    alpha = 0.4,
    position = posn.d
  ) +
  stat_summary(
    fun.y = median,
    geom = "point",
    size = 3,
    shape = "X",
    col = "black",
    position = posn.d
  )
Further examples (cached) from car::Vocab include:
# Local copy of the Vocab data set, taken from the car namespace
# NOTE(review): newer car releases may ship this data set through a separate
# data package - confirm that car::Vocab still resolves.
Vocab <- car::Vocab
# Plot 1: jittered, semi-transparent scatter with one linear-model (lm) line
ggplot(data = Vocab, mapping = aes(x = education, y = vocabulary)) +
  geom_jitter(alpha = 0.2) +
  stat_smooth(se = FALSE, method = "lm")
# Plot 2: lm lines only, one colour per year
ggplot(
  data = Vocab,
  mapping = aes(x = education, y = vocabulary, col = factor(year))
) +
  stat_smooth(se = FALSE, method = "lm")
# Plot 3: same lines with the default colour brewer palette
# (the warnings that follow report that the palette has too few colours)
ggplot(
  data = Vocab,
  mapping = aes(x = education, y = vocabulary, col = factor(year))
) +
  stat_smooth(se = FALSE, method = "lm") +
  scale_color_brewer()
## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Blues is 9
## Returning the palette you asked for with that many colors
## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Blues is 9
## Returning the palette you asked for with that many colors
# Plot 4: colour by year as a number, group by factor(year) to keep one
# line per year, and set alpha and size on the lines
ggplot(
  data = Vocab,
  mapping = aes(x = education, y = vocabulary, col = year, group = factor(year))
) +
  stat_smooth(se = FALSE, method = "lm", size = 2, alpha = 0.6) +
  scale_color_gradientn(colors = brewer.pal(9, "YlOrRd"))
# Same plot with stat_quantile in place of stat_smooth:
ggplot(
  data = Vocab,
  mapping = aes(x = education, y = vocabulary, col = year, group = factor(year))
) +
  stat_quantile(size = 2, alpha = 0.6) +
  scale_color_gradientn(colors = brewer.pal(9, "YlOrRd"))
## Warning: package 'quantreg' was built under R version 3.2.4
## Loading required package: SparseM
##
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
##
## backsolve
## Smoothing formula not specified. Using: y ~ x
## Smoothing formula not specified. Using: y ~ x
## Smoothing formula not specified. Using: y ~ x
## Smoothing formula not specified. Using: y ~ x
## Smoothing formula not specified. Using: y ~ x
## Smoothing formula not specified. Using: y ~ x
## Smoothing formula not specified. Using: y ~ x
## Smoothing formula not specified. Using: y ~ x
## Smoothing formula not specified. Using: y ~ x
## Smoothing formula not specified. Using: y ~ x
## Warning in rq.fit.br(wx, wy, tau = tau, ...): Solution may be nonunique
## Smoothing formula not specified. Using: y ~ x
## Smoothing formula not specified. Using: y ~ x
## Smoothing formula not specified. Using: y ~ x
## Smoothing formula not specified. Using: y ~ x
## Smoothing formula not specified. Using: y ~ x
## Smoothing formula not specified. Using: y ~ x
## Warning in rq.fit.br(wx, wy, tau = tau, ...): Solution may be nonunique
## Warning in rq.fit.br(wx, wy, tau = tau, ...): Solution may be nonunique
# Restrict stat_quantile to the 0.5 quantile only:
ggplot(
  data = Vocab,
  mapping = aes(x = education, y = vocabulary, col = year, group = factor(year))
) +
  stat_quantile(quantiles = 0.5, size = 2, alpha = 0.6) +
  scale_color_gradientn(colors = brewer.pal(9, "YlOrRd"))
## Smoothing formula not specified. Using: y ~ x
## Smoothing formula not specified. Using: y ~ x
## Smoothing formula not specified. Using: y ~ x
## Smoothing formula not specified. Using: y ~ x
## Smoothing formula not specified. Using: y ~ x
## Smoothing formula not specified. Using: y ~ x
## Smoothing formula not specified. Using: y ~ x
## Smoothing formula not specified. Using: y ~ x
## Smoothing formula not specified. Using: y ~ x
## Smoothing formula not specified. Using: y ~ x
## Smoothing formula not specified. Using: y ~ x
## Smoothing formula not specified. Using: y ~ x
## Smoothing formula not specified. Using: y ~ x
## Smoothing formula not specified. Using: y ~ x
## Smoothing formula not specified. Using: y ~ x
## Smoothing formula not specified. Using: y ~ x
## Warning in rq.fit.br(wx, wy, tau = tau, ...): Solution may be nonunique
# Plot comparing a LOESS fit and a linear-model fit on the Vocab data.
# Inside aes(), "red" and "blue" are level names of the colour scale rather
# than literal colours; scale_color_discrete relabels them as "LOESS" and
# "lm" in the legend.
p <- ggplot(Vocab, aes(x = education, y = vocabulary)) +
  stat_smooth(method = "loess", aes(col = "red"), se = FALSE) +
  stat_smooth(method = "lm", aes(col = "blue"), se = FALSE) +
  scale_color_discrete("Model", labels = c("red" = "LOESS", "blue" = "lm"))
# Add stat_sum
p + stat_sum()
# Add stat_sum and set size range
p + stat_sum() + scale_size(range = c(1, 10))
The coordinates layer controls the plot dimensions:
The facets are based on the concept of small multiples, as described in Tufte's book “The Visual Display of Quantitative Information” (1983):
Example code includes:
# Reload mtcars and treat cyl and am as categorical variables
data(mtcars)
mtcars[["cyl"]] <- as.factor(mtcars[["cyl"]])
mtcars[["am"]] <- as.factor(mtcars[["am"]])
# Basic plot: horsepower against weight, coloured by transmission, with a
# default smoother
p <- ggplot(data = mtcars, mapping = aes(x = wt, y = hp, col = am)) +
  geom_point() +
  geom_smooth()
# Zoom by setting limits on the x scale; the warnings that follow show rows
# being removed before the smoother runs
p + scale_x_continuous(expand = c(0, 0), limits = c(3, 6))
## Warning: Removed 12 rows containing non-finite values (stat_smooth).
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : span too small. fewer data values than degrees of freedom.
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at 3.168
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 4e-006
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 3.168
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 0.002
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at 3.572
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 4e-006
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 4e-006
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : zero-width neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : zero-width neighborhood. make span bigger
## Warning: Computation failed in `stat_smooth()`:
## NA/NaN/Inf in foreign function call (arg 5)
## Warning: Removed 12 rows containing missing values (geom_point).
# The proper way to zoom in: set xlim on the coordinate system
p + coord_cartesian(xlim = c(3, 6))
data(iris)
# Base scatter plot: jittered sepal measurements with one lm line per species
base.plot <- ggplot(
  data = iris,
  mapping = aes(x = Sepal.Length, y = Sepal.Width, col = Species)
) +
  geom_jitter() +
  geom_smooth(se = FALSE, method = "lm")
# Draw base.plot with the default aspect ratio
base.plot
# Draw base.plot with a fixed 1:1 aspect ratio
base.plot + coord_equal()
# thin.bar: a single stacked bar (x fixed at 1), filled by cylinder
thin.bar <- ggplot(data = mtcars, mapping = aes(x = 1, fill = cyl)) +
  geom_bar()
# Map y to the angle to turn thin.bar into a pie-style chart
thin.bar + coord_polar(theta = "y")
# wide.bar: the same stacked bar drawn with width = 1
wide.bar <- ggplot(data = mtcars, mapping = aes(x = 1, fill = cyl)) +
  geom_bar(width = 1)
# Map y to the angle to turn wide.bar into a pie-style chart
wide.bar + coord_polar(theta = "y")
# Basic scatter plot of fuel economy against weight:
p <- ggplot(data = mtcars, mapping = aes(x = wt, y = mpg)) +
  geom_point()
# One facet row per transmission type (am)
p + facet_grid(am ~ .)
# One facet column per cylinder count (cyl)
p + facet_grid(. ~ cyl)
# Rows by am and columns by cyl together
p + facet_grid(am ~ cyl)
# Combined cylinder/transmission label, e.g. "<cyl>_<am>"
mtcars$cyl_am <- paste(mtcars$cyl, mtcars$am, sep = "_")
# Two-row colour matrix: three blue shades over three red shades. R stores
# matrices column by column, so read as a vector the colours alternate
# blue/red - presumably to line up with the sorted cyl_am values.
myCol <- rbind(
  brewer.pal(9, "Blues")[c(3, 6, 8)],
  brewer.pal(9, "Reds")[c(3, 6, 8)]
)
# Basic scatter plot coloured by cyl_am with the manual colour scale:
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg, col = cyl_am)) +
  geom_point() +
  scale_color_manual(values = myCol)
# Facet by gear (rows) and vs (columns):
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg, col = cyl_am)) +
  geom_point() +
  scale_color_manual(values = myCol) +
  facet_grid(gear ~ vs)
# Add one more variable: point size mapped to disp
ggplot(
  data = mtcars,
  mapping = aes(x = wt, y = mpg, col = cyl_am, size = disp)
) +
  geom_point() +
  scale_color_manual(values = myCol) +
  facet_grid(gear ~ vs)
# Build a long-format sleep table from ggplot2::msleep: keep vore and name,
# copy sleep_total/sleep_rem into total/rem, drop rows where either is
# missing, then stack total and rem into a key column (sleep) and a value
# column (time).
mamsleep <- ggplot2::msleep %>%
  mutate(total = sleep_total, rem = sleep_rem) %>%
  select(vore, name, total, rem) %>%
  filter(!is.na(total), !is.na(rem)) %>%
  tidyr::gather(sleep, time, -c(vore, name))
# Fix the level order so "total" comes before "rem"
mamsleep$sleep <- factor(mamsleep$sleep, levels = c("total", "rem"))
str(mamsleep)
## Classes 'tbl_df', 'tbl' and 'data.frame': 122 obs. of 4 variables:
## $ vore : chr "omni" "herbi" "omni" "herbi" ...
## $ name : chr "Owl monkey" "Mountain beaver" "Greater short-tailed shrew" "Cow" ...
## $ sleep: Factor w/ 2 levels "total","rem": 1 1 1 1 1 1 1 1 1 1 ...
## $ time : num 17 14.4 14.9 4 14.4 8.7 10.1 5.3 9.4 10 ...
# Basic scatter plot: sleep time per animal, coloured by sleep type
ggplot(mamsleep, aes(x = time, y = name, col = sleep)) +
  geom_point()
# Facet rows according to vore
ggplot(mamsleep, aes(x = time, y = name, col = sleep)) +
  geom_point() +
  facet_grid(vore ~ .)
# Free the y scale and the panel heights per facet row. The argument is
# spelled out as `scales` rather than relying on partial matching of `scale`.
ggplot(mamsleep, aes(x = time, y = name, col = sleep)) +
  geom_point() +
  facet_grid(vore ~ ., scales = "free_y", space = "free_y")